[llvm] deae979 - Revert "Reapply "[SLP] Schedule only sub-graph of vectorizable instructions"""

Thu Mar 3 11:35:43 PST 2022

Author: Philip Reames
Date: 2022-03-03T11:35:34-08:00
New Revision: deae979a2cfb55b0cc74f4c4569fdcd6135eb89f

URL: https://github.com/llvm/llvm-project/commit/deae979a2cfb55b0cc74f4c4569fdcd6135eb89f
DIFF: https://github.com/llvm/llvm-project/commit/deae979a2cfb55b0cc74f4c4569fdcd6135eb89f.diff

LOG: Revert "Reapply "[SLP] Schedule only sub-graph of vectorizable instructions"""

This reverts commit 738042711bc08cde9135873200b1d088e6cf11c3. A second, apparently separate, issue has been reported on the original review.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
    llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
    llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
    llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/align.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
    llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
    llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
    llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
    llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
    llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
    llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
    llvm/test/Transforms/SLPVectorizer/X86/cse.ll
    llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
    llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
    llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
    llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
    llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
    llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll
    llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
    llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
    llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
    llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
    llvm/test/Transforms/SLPVectorizer/X86/fma.ll
    llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
    llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
    llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
    llvm/test/Transforms/SLPVectorizer/X86/fround.ll
    llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
    llvm/test/Transforms/SLPVectorizer/X86/gep.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
    llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
    llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
    llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
    llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
    llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
    llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
    llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
    llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
    llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
    llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
    llvm/test/Transforms/SLPVectorizer/X86/resched.ll
    llvm/test/Transforms/SLPVectorizer/X86/return.ll
    llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
    llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
    llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
    llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
    llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
    llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
    llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
    llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
    llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
    llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
    llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
    llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
    llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
    llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
    llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3ce495335c35d..ddcd0c3a496c4 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2779,8 +2779,7 @@ class BoUpSLP {
         }
         // Handle the memory dependencies.
         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
-          if (MemoryDepSD->hasValidDependencies() &&
-              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
             // There are no more unscheduled dependencies after decrementing,
             // so we can put the dependent instruction into the ready list.
             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
@@ -2837,8 +2836,7 @@ class BoUpSLP {
     void initialFillReadyList(ReadyListType &ReadyList) {
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         doForAllOpcodes(I, [&](ScheduleData *SD) {
-          if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
-              SD->isReady()) {
+          if (SD->isSchedulingEntity() && SD->isReady()) {
             ReadyList.insert(SD);
             LLVM_DEBUG(dbgs()
                        << "SLP:    initially in ready list: " << *SD << "\n");
@@ -7985,11 +7983,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 
   LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
 
-  // A key point - if we got here, pre-scheduling was able to find a valid
-  // scheduling of the sub-graph of the scheduling window which consists
-  // of all vector bundles and their transitive users.  As such, we do not
-  // need to reschedule anything *outside of* that subgraph.
-
   BS->resetSchedule();
 
   // For the real scheduling we use a more sophisticated ready-list: it is
@@ -8002,19 +7995,21 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   };
   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
 
-  // Ensure that all dependency data is updated (for nodes in the sub-graph)
-  // and fill the ready-list with initial instructions.
+  // Ensure that all dependency data is updated and fill the ready-list with
+  // initial instructions.
   int Idx = 0;
+  int NumToSchedule = 0;
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
-    BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
+    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
               SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
-
-      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
+      if (SD->isSchedulingEntity()) {
         BS->calculateDependencies(SD, false, this);
+        NumToSchedule++;
+      }
     });
   }
   BS->initialFillReadyList(ReadyInsts);
@@ -8037,7 +8032,9 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     }
 
     BS->schedule(picked, ReadyInsts);
+    NumToSchedule--;
   }
+  assert(NumToSchedule == 0 && "could not schedule all instructions");
 
   // Check that we didn't break any of our invariants.
 #ifdef EXPENSIVE_CHECKS

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
index 531fe4fb815a1..10883987aa758 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -9,11 +9,11 @@ define void @f(float* %r, float* %w) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
 ; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
-; CHECK-NEXT:    [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
-; CHECK-NEXT:    [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; CHECK-NEXT:    [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>*
 ; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 7d38eb60d1f03..4bd0754bf9594 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -12,9 +12,9 @@ define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
@@ -57,9 +57,9 @@ define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index ec7b03af83f8b..536f72a73684e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,7 +36,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -86,6 +85,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -111,7 +111,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -161,6 +160,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -297,7 +297,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -347,6 +346,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -372,7 +372,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -422,6 +421,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 4be767ce01e84..4d0e0dfd69f14 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -37,9 +37,9 @@ define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias noca
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
@@ -163,9 +163,9 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
@@ -274,10 +274,10 @@ define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noali
 ; CHECK-NEXT:    [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6
 ; CHECK-NEXT:    [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7
-; CHECK-NEXT:    [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll
index c71da72317c9e..f4b027086265f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll
@@ -18,7 +18,6 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
@@ -33,6 +32,7 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[DST]] to <4 x i8>*
 ; CHECK-NEXT:    store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
index c68c7732bbeac..97ae874e6c9a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -17,58 +17,58 @@ define void @wrap_mul4(double* nocapture %Out, [2 x double]* nocapture readonly
 ; CHECK-NEXT:    [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1
-; CHECK-NEXT:    [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1
-; CHECK-NEXT:    [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2
-; CHECK-NEXT:    [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2
-; CHECK-NEXT:    [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3
-; CHECK-NEXT:    [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3
-; CHECK-NEXT:    [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0
-; CHECK-NEXT:    [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8
-; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1
-; CHECK-NEXT:    [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8
-; CHECK-NEXT:    [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[OUT]] to <2 x double>*
+; CHECK-NEXT:    [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2
+; CHECK-NEXT:    [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2
+; CHECK-NEXT:    [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>*
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0
+; CHECK-NEXT:    [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8
+; CHECK-NEXT:    [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1
+; CHECK-NEXT:    [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1
+; CHECK-NEXT:    [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2
 ; CHECK-NEXT:    [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>*
-; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP4]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>*
-; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x double> [[TMP9]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <2 x double> [[TMP15]], [[TMP18]]
-; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP19]], <2 x double>* [[TMP20]], align 8
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4
 ; CHECK-NEXT:    [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP10]], i32 1
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> [[TMP2]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <2 x double> [[TMP24]], double [[TEMP11]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = fmul <2 x double> [[TMP7]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fadd <2 x double> [[TMP23]], [[TMP26]]
-; CHECK-NEXT:    [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8
 ; CHECK-NEXT:    [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6
 ; CHECK-NEXT:    [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 7
-; CHECK-NEXT:    [[TMP29:%.*]] = fmul <2 x double> [[TMP14]], [[TMP22]]
-; CHECK-NEXT:    [[TMP30:%.*]] = fmul <2 x double> [[TMP17]], [[TMP25]]
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP31]], <2 x double>* [[TMP32]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index aa61c095ecc7a..d6bb9bda4bba8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -326,10 +326,10 @@ define void @no_version(i32* nocapture %dst, i32* nocapture readonly %src) {
 ; CHECK-LABEL: @no_version(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr <2 x i32> [[TMP1]], <i32 16, i32 16>
+; CHECK-NEXT:    [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
 ; CHECK-NEXT:    store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -902,7 +902,12 @@ define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %a
 ; CHECK-NEXT:    [[A_GEP_14:%.*]] = getelementptr i8, i8* [[A]], i64 14
 ; CHECK-NEXT:    [[B_GEP_14:%.*]] = getelementptr i8, i8* [[B]], i64 14
 ; CHECK-NEXT:    [[A_GEP_15:%.*]] = getelementptr i8, i8* [[A]], i64 15
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[B_GEP_15:%.*]] = getelementptr i8, i8* [[B]], i64 15
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[R_GEP_0:%.*]] = getelementptr i8, i8* [[ARG1]], i64 0
 ; CHECK-NEXT:    [[R_GEP_1:%.*]] = getelementptr i8, i8* [[ARG1]], i64 1
 ; CHECK-NEXT:    [[R_GEP_2:%.*]] = getelementptr i8, i8* [[ARG1]], i64 2
@@ -919,11 +924,6 @@ define i32 @block_partly_vectorized_without_versioning(%struct.spam* readonly %a
 ; CHECK-NEXT:    [[R_GEP_13:%.*]] = getelementptr i8, i8* [[ARG1]], i64 13
 ; CHECK-NEXT:    [[R_GEP_14:%.*]] = getelementptr i8, i8* [[ARG1]], i64 14
 ; CHECK-NEXT:    [[R_GEP_15:%.*]] = getelementptr i8, i8* [[ARG1]], i64 15
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[R_GEP_0]] to <16 x i8>*
 ; CHECK-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 1
 ; CHECK-NEXT:    [[T21:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 15

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
index a631a9732a203..dfd322013399f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
@@ -13,14 +13,14 @@ define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
index 3f5d5147fd508..2f7076dd25ca3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll
@@ -21,9 +21,9 @@ define i8 @reduce_and(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
index f0cc2fc8d4022..53126ee407e98 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll
@@ -21,9 +21,9 @@ define i8 @reduce_or(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
index 0136453298b79..2c59e57cec56a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll
@@ -21,9 +21,9 @@ define i8 @reduce_xor(%struct.buf* %a, %struct.buf* %b) {
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[TMP1]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
index 05b1bf2a45b14..39f2f885bc26b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
@@ -15,13 +15,13 @@ define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 {
 ; CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]]
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]]
 ; CHECK-NEXT:    [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29:![0-9]+]]
-; CHECK-NEXT:    [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG30:![0-9]+]]
-; CHECK-NEXT:    [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG31:![0-9]+]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA32:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT:    [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT:    [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]]
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA32]]
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]]
 ; CHECK-NEXT:    ret void, !dbg [[DBG37:![0-9]+]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
index 96b3aa2509c4d..8e0ca4b293841 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll
@@ -15,15 +15,15 @@ define void @test(i64* %ptr, i64* noalias %res) {
 ; CHECK-NEXT:    [[CALL_I_I:%.*]] = call i32* @get_ptr()
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 1
-; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3
-; CHECK-NEXT:    [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[RES]] to <2 x i64>*
 ; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @cond()

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 24c69fb667272..b1da97e0f96ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -135,13 +135,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
@@ -171,12 +171,12 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]]
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 962f8ca42bf86..c670673e95936 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -135,13 +135,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
@@ -171,12 +171,12 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]]
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
index b5f4dad4e712d..2b634ae718277 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
@@ -33,9 +33,9 @@ define i32 @s352() {
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX6]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
@@ -48,9 +48,9 @@ define i32 @s352() {
 ; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP9]]
-; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4
+; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP11]], [[TMP13]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
index 4821ae9c64fd3..3129ba79dafaa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll
@@ -25,6 +25,14 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 13
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 15
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw <8 x i16> [[TMP5]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw <8 x i16> [[TMP6]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3
@@ -40,18 +48,10 @@ define void @PR50256(i8* %a, i16* %b, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX3_13:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 13
 ; CHECK-NEXT:    [[ARRAYIDX3_14:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX3_15:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 15
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i16> [[TMP3]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B]] to <8 x i16>*
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16>
-; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP8]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-; CHECK-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16* [[B]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP7]], <8 x i16>* [[TMP9]], align 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3_8]] to <8 x i16>*
-; CHECK-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2
+; CHECK-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* [[TMP10]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx.1 = getelementptr inbounds i8, i8* %a, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
index 7ab2df33692ef..4cc004ed9862a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -191,10 +191,10 @@ define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, ha
 ; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
 ; GCN-NEXT:    [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2
 ; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
-; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1
-; GCN-NEXT:    [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2
 ; GCN-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)*
 ; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2
+; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1
+; GCN-NEXT:    [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2
 ; GCN-NEXT:    [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)*
 ; GCN-NEXT:    [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2
 ; GCN-NEXT:    [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
index 09ffb51df081b..c45a900ccc555 100644
--- a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -14,11 +14,11 @@ define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %a
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
 ; CHECK-NEXT:    store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
index 4812df4cb5e87..d334197ad9c7b 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -14,13 +14,13 @@ define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:    [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4
-; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    unreachable

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
index 12f5470dacd07..741dbcec392e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -4,12 +4,12 @@
 define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplified(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
@@ -33,12 +33,12 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
 define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_reversed(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
@@ -62,12 +62,12 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
 define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_extract(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index bc8abf3a92eec..7668747a75ace 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -54,11 +54,9 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
 ; FORCE_REDUCTION-NEXT:    [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
 ; FORCE_REDUCTION-NEXT:    [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
-; FORCE_REDUCTION-NEXT:    [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
-; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
 ; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]])
 ; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
 ; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
@@ -90,7 +88,9 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
 ; FORCE_REDUCTION-NEXT:    [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]]
+; FORCE_REDUCTION-NEXT:    [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
 ; FORCE_REDUCTION-NEXT:    [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]]
+; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
 ; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0
 ; FORCE_REDUCTION-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
 ; FORCE_REDUCTION-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[VAL_41]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 1e2c9e8402376..ebbbefc9f81f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -302,21 +302,21 @@ define void @reorder_alt_subTree() #0 {
 define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
 ; CHECK-LABEL: @reorder_alt_rightsubTree(
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[D]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[A]] to <2 x double>*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[B]] to <2 x double>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fsub <2 x double> [[TMP11]], [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP11]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %1 = load double, double* %a

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/align.ll b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
index 90091b5af8ada..a4ddfa1989d9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/align.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
@@ -12,9 +12,9 @@ define void @test1(double* %a, double* %b, double* %c) {
 ; CHECK-NEXT:    [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1
 ; CHECK-NEXT:    [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
@@ -48,11 +48,11 @@ define void @test2(float * %a, float * %b) {
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
 ; CHECK-NEXT:    [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
 ; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
index 9794ca999acad..fc78797f5f7d3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll
@@ -28,40 +28,40 @@ declare i8  @llvm.abs.i8 (i8, i1)
 define void @abs_v8i64() {
 ; SSE-LABEL: @abs_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
-; SSE-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
-; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false)
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false)
+; SSE-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)
+; SSE-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
-; SLM-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
-; SLM-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false)
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false)
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false)
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false)
+; SLM-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false)
+; SLM-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false)
+; SLM-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @abs_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false)
-; AVX-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP3]], i1 false)
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false)
+; AVX-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP2]], i1 false)
+; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -101,40 +101,40 @@ define void @abs_v8i64() {
 define void @abs_v16i32() {
 ; SSE-LABEL: @abs_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false)
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false)
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SLM-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false)
-; SLM-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false)
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false)
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SLM-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SLM-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false)
+; SLM-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @abs_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP3]], i1 false)
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -198,40 +198,40 @@ define void @abs_v16i32() {
 define void @abs_v32i16() {
 ; SSE-LABEL: @abs_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false)
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false)
-; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false)
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SSE-NEXT:    [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false)
+; SSE-NEXT:    store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false)
-; SLM-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false)
-; SLM-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false)
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false)
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SLM-NEXT:    [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false)
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false)
+; SLM-NEXT:    store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @abs_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false)
-; AVX-NEXT:    store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP3]], i1 false)
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false)
+; AVX-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP2]], i1 false)
+; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP4]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -343,40 +343,40 @@ define void @abs_v32i16() {
 define void @abs_v64i8() {
 ; SSE-LABEL: @abs_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false)
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false)
-; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false)
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false)
+; SSE-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false)
+; SSE-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @abs_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false)
-; SLM-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false)
-; SLM-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false)
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false)
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false)
+; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false)
+; SLM-NEXT:    [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false)
+; SLM-NEXT:    [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false)
+; SLM-NEXT:    store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @abs_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false)
-; AVX-NEXT:    store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP3]], i1 false)
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false)
+; AVX-NEXT:    [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP2]], i1 false)
+; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP4]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index a2bbec2cd3a7f..66154522c327d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -98,12 +98,12 @@ define void @add_v8i64() {
 ;
 ; AVX-LABEL: @add_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -190,50 +190,50 @@ define void @add_v8i64() {
 define void @add_v16i32() {
 ; SSE-LABEL: @add_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -314,50 +314,50 @@ define void @add_v16i32() {
 define void @add_v32i16() {
 ; SSE-LABEL: @add_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -502,50 +502,50 @@ define void @add_v32i16() {
 define void @add_v64i8() {
 ; SSE-LABEL: @add_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index 203270d73c759..fceb1cfc69017 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -63,12 +63,12 @@ define void @add_v8i64() {
 ;
 ; AVX-LABEL: @add_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -117,31 +117,31 @@ define void @add_v8i64() {
 define void @add_v16i32() {
 ; SSE-LABEL: @add_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -222,31 +222,31 @@ define void @add_v16i32() {
 define void @add_v32i16() {
 ; SSE-LABEL: @add_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -391,31 +391,31 @@ define void @add_v32i16() {
 define void @add_v64i8() {
 ; SSE-LABEL: @add_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
index 316c323f44f09..085e9adeaeef6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -25,50 +25,50 @@
 define void @add_v8i64() {
 ; SSE-LABEL: @add_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -117,50 +117,50 @@ define void @add_v8i64() {
 define void @add_v16i32() {
 ; SSE-LABEL: @add_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -241,50 +241,50 @@ define void @add_v16i32() {
 define void @add_v32i16() {
 ; SSE-LABEL: @add_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -429,50 +429,50 @@ define void @add_v32i16() {
 define void @add_v64i8() {
 ; SSE-LABEL: @add_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = add <32 x i8> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = add <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = add <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll
index 0cb304d7859a7..a1386e30170e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll
@@ -25,40 +25,40 @@
 define void @sdiv_v16i32_uniformconst() {
 ; SSE-LABEL: @sdiv_v16i32_uniformconst(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sdiv_v16i32_uniformconst(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sdiv_v16i32_uniformconst(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = sdiv <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = sdiv <8 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -122,40 +122,40 @@ define void @sdiv_v16i32_uniformconst() {
 define void @srem_v16i32_uniformconst() {
 ; SSE-LABEL: @srem_v16i32_uniformconst(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @srem_v16i32_uniformconst(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @srem_v16i32_uniformconst(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = srem <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = srem <8 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -219,40 +219,40 @@ define void @srem_v16i32_uniformconst() {
 define void @udiv_v16i32_uniformconst() {
 ; SSE-LABEL: @udiv_v16i32_uniformconst(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @udiv_v16i32_uniformconst(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @udiv_v16i32_uniformconst(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = udiv <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = udiv <8 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -316,40 +316,40 @@ define void @udiv_v16i32_uniformconst() {
 define void @urem_v16i32_uniformconst() {
 ; SSE-LABEL: @urem_v16i32_uniformconst(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SSE-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @urem_v16i32_uniformconst(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], <i32 5, i32 5, i32 5, i32 5>
+; SLM-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @urem_v16i32_uniformconst(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = urem <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = urem <8 x i32> [[TMP3]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
index e2c2c9a1ff32b..acd9979a6fbbd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -28,69 +28,69 @@ declare i8  @llvm.smul.fix.i8 (i8 , i8 , i32)
 define void @smul_v8i64() {
 ; SSE-LABEL: @smul_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smul_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX1-LABEL: @smul_v8i64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; AVX1-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; AVX1-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; AVX1-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @smul_v8i64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3)
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3)
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -103,12 +103,12 @@ define void @smul_v8i64() {
 ;
 ; AVX256BW-LABEL: @smul_v8i64(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3)
-; AVX256BW-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3)
+; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX256BW-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX256BW-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    ret void
 ;
@@ -150,20 +150,20 @@ define void @smul_v8i64() {
 define void @smul_v16i32() {
 ; SSE-LABEL: @smul_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3)
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -236,12 +236,12 @@ define void @smul_v16i32() {
 ;
 ; AVX-LABEL: @smul_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3)
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -322,50 +322,50 @@ define void @smul_v16i32() {
 define void @smul_v32i16() {
 ; SSE-LABEL: @smul_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smul_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3)
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smul_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3)
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -510,50 +510,50 @@ define void @smul_v32i16() {
 define void @smul_v64i8() {
 ; SSE-LABEL: @smul_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smul_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smul_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3)
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3)
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;
@@ -831,69 +831,69 @@ declare i8  @llvm.umul.fix.i8 (i8 , i8 , i32)
 define void @umul_v8i64() {
 ; SSE-LABEL: @umul_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umul_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX1-LABEL: @umul_v8i64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3)
-; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3)
+; AVX1-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; AVX1-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; AVX1-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; AVX1-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; AVX1-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; AVX1-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @umul_v8i64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3)
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3)
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -906,12 +906,12 @@ define void @umul_v8i64() {
 ;
 ; AVX256BW-LABEL: @umul_v8i64(
 ; AVX256BW-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3)
-; AVX256BW-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256BW-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3)
+; AVX256BW-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX256BW-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX256BW-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256BW-NEXT:    ret void
 ;
@@ -953,20 +953,20 @@ define void @umul_v8i64() {
 define void @umul_v16i32() {
 ; SSE-LABEL: @umul_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3)
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3)
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -1039,12 +1039,12 @@ define void @umul_v16i32() {
 ;
 ; AVX-LABEL: @umul_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3)
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -1125,50 +1125,50 @@ define void @umul_v16i32() {
 define void @umul_v32i16() {
 ; SSE-LABEL: @umul_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3)
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umul_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3)
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3)
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umul_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3)
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -1313,50 +1313,50 @@ define void @umul_v32i16() {
 define void @umul_v64i8() {
 ; SSE-LABEL: @umul_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umul_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umul_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3)
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3)
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3)
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3)
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
index 053fa937f917c..c38794dae9253 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -95,31 +95,31 @@ define void @mul_v8i64() {
 ;
 ; AVX128-LABEL: @mul_v8i64(
 ; AVX128-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP2]]
-; AVX128-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP6:%.*]] = mul <2 x i64> [[TMP4]], [[TMP5]]
-; AVX128-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP9:%.*]] = mul <2 x i64> [[TMP7]], [[TMP8]]
-; AVX128-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; AVX128-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[TMP10]], [[TMP11]]
+; AVX128-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX128-NEXT:    [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]]
+; AVX128-NEXT:    [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]]
+; AVX128-NEXT:    [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]]
+; AVX128-NEXT:    [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]]
+; AVX128-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX128-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX128-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; AVX128-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; AVX128-NEXT:    ret void
 ;
 ; AVX256-LABEL: @mul_v8i64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX256-NEXT:    [[TMP3:%.*]] = mul <4 x i64> [[TMP1]], [[TMP2]]
-; AVX256-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX256-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[TMP4]], [[TMP5]]
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]]
+; AVX256-NEXT:    [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]]
+; AVX256-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX256-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256-NEXT:    ret void
 ;
@@ -168,69 +168,69 @@ define void @mul_v8i64() {
 define void @mul_v16i32() {
 ; SSE-LABEL: @mul_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @mul_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX128-LABEL: @mul_v16i32(
 ; AVX128-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]]
-; AVX128-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]]
-; AVX128-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]]
-; AVX128-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; AVX128-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]]
+; AVX128-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX128-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; AVX128-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; AVX128-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; AVX128-NEXT:    [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; AVX128-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX128-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX128-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; AVX128-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; AVX128-NEXT:    ret void
 ;
 ; AVX256-LABEL: @mul_v16i32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = mul <8 x i32> [[TMP1]], [[TMP2]]
-; AVX256-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]]
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
+; AVX256-NEXT:    [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]]
+; AVX256-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX256-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -311,69 +311,69 @@ define void @mul_v16i32() {
 define void @mul_v32i16() {
 ; SSE-LABEL: @mul_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @mul_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX128-LABEL: @mul_v32i16(
 ; AVX128-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]]
-; AVX128-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
-; AVX128-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]]
-; AVX128-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; AVX128-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]]
+; AVX128-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; AVX128-NEXT:    [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; AVX128-NEXT:    [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; AVX128-NEXT:    [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; AVX128-NEXT:    [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; AVX128-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; AVX128-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; AVX128-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; AVX128-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; AVX128-NEXT:    ret void
 ;
 ; AVX256-LABEL: @mul_v32i16(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX256-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX256-NEXT:    [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], [[TMP2]]
-; AVX256-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX256-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX256-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX256-NEXT:    [[TMP6:%.*]] = mul <16 x i16> [[TMP4]], [[TMP5]]
+; AVX256-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX256-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX256-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX256-NEXT:    [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; AVX256-NEXT:    [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
+; AVX256-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX256-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX256-NEXT:    ret void
 ;
@@ -518,69 +518,69 @@ define void @mul_v32i16() {
 define void @mul_v64i8() {
 ; SSE-LABEL: @mul_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @mul_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX128-LABEL: @mul_v64i8(
 ; AVX128-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]]
-; AVX128-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]]
-; AVX128-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
-; AVX128-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; AVX128-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; AVX128-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; AVX128-NEXT:    [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
+; AVX128-NEXT:    [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
+; AVX128-NEXT:    [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
+; AVX128-NEXT:    [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
+; AVX128-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; AVX128-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; AVX128-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; AVX128-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; AVX128-NEXT:    ret void
 ;
 ; AVX256-LABEL: @mul_v64i8(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX256-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX256-NEXT:    [[TMP3:%.*]] = mul <32 x i8> [[TMP1]], [[TMP2]]
-; AVX256-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX256-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX256-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX256-NEXT:    [[TMP6:%.*]] = mul <32 x i8> [[TMP4]], [[TMP5]]
+; AVX256-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX256-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX256-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX256-NEXT:    [[TMP5:%.*]] = mul <32 x i8> [[TMP1]], [[TMP3]]
+; AVX256-NEXT:    [[TMP6:%.*]] = mul <32 x i8> [[TMP2]], [[TMP4]]
+; AVX256-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX256-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
index d71a3cb239ef9..a773ab657caa1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
@@ -63,31 +63,31 @@ define void @smax_v8i64() {
 ;
 ; SLM-LABEL: @smax_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smax_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -136,50 +136,50 @@ define void @smax_v8i64() {
 define void @smax_v16i32() {
 ; SSE-LABEL: @smax_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smax_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smax_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -260,50 +260,50 @@ define void @smax_v16i32() {
 define void @smax_v32i16() {
 ; SSE-LABEL: @smax_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smax_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smax_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -448,50 +448,50 @@ define void @smax_v32i16() {
 define void @smax_v64i8() {
 ; SSE-LABEL: @smax_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smax_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smax_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
index af2c560d12a98..1ead7570ca3c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
@@ -63,31 +63,31 @@ define void @smin_v8i64() {
 ;
 ; SLM-LABEL: @smin_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smin_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -136,50 +136,50 @@ define void @smin_v8i64() {
 define void @smin_v16i32() {
 ; SSE-LABEL: @smin_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smin_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smin_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -260,50 +260,50 @@ define void @smin_v16i32() {
 define void @smin_v32i16() {
 ; SSE-LABEL: @smin_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smin_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smin_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -448,50 +448,50 @@ define void @smin_v32i16() {
 define void @smin_v64i8() {
 ; SSE-LABEL: @smin_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @smin_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @smin_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
index d25eccd882079..88f18cba2b2ee 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -98,12 +98,12 @@ define void @sub_v8i64() {
 ;
 ; AVX-LABEL: @sub_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -190,50 +190,50 @@ define void @sub_v8i64() {
 define void @sub_v16i32() {
 ; SSE-LABEL: @sub_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -314,50 +314,50 @@ define void @sub_v16i32() {
 define void @sub_v32i16() {
 ; SSE-LABEL: @sub_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -502,50 +502,50 @@ define void @sub_v32i16() {
 define void @sub_v64i8() {
 ; SSE-LABEL: @sub_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
index 79bba89de18a6..afe17947cc3af 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -63,12 +63,12 @@ define void @sub_v8i64() {
 ;
 ; AVX-LABEL: @sub_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -117,31 +117,31 @@ define void @sub_v8i64() {
 define void @sub_v16i32() {
 ; SSE-LABEL: @sub_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -222,31 +222,31 @@ define void @sub_v16i32() {
 define void @sub_v32i16() {
 ; SSE-LABEL: @sub_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -391,31 +391,31 @@ define void @sub_v32i16() {
 define void @sub_v64i8() {
 ; SSE-LABEL: @sub_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
index a98c1234d3611..11537e3965af4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -25,50 +25,50 @@
 define void @sub_v8i64() {
 ; SSE-LABEL: @sub_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = sub <4 x i64> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -117,50 +117,50 @@ define void @sub_v8i64() {
 define void @sub_v16i32() {
 ; SSE-LABEL: @sub_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -241,50 +241,50 @@ define void @sub_v16i32() {
 define void @sub_v32i16() {
 ; SSE-LABEL: @sub_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]]
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -429,50 +429,50 @@ define void @sub_v32i16() {
 define void @sub_v64i8() {
 ; SSE-LABEL: @sub_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = sub <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = sub <16 x i8> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]]
+; SLM-NEXT:    [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]]
+; SLM-NEXT:    [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]]
+; SLM-NEXT:    [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]]
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = sub <32 x i8> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = sub <32 x i8> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = sub <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = sub <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
index a481971db382d..5202e41fd770b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
@@ -63,31 +63,31 @@ define void @umax_v8i64() {
 ;
 ; SLM-LABEL: @umax_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umax_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -136,50 +136,50 @@ define void @umax_v8i64() {
 define void @umax_v16i32() {
 ; SSE-LABEL: @umax_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umax_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umax_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -260,50 +260,50 @@ define void @umax_v16i32() {
 define void @umax_v32i16() {
 ; SSE-LABEL: @umax_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umax_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umax_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -448,50 +448,50 @@ define void @umax_v32i16() {
 define void @umax_v64i8() {
 ; SSE-LABEL: @umax_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umax_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umax_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
index bbed2195a4f37..bfa6c1f590af2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
@@ -63,31 +63,31 @@ define void @umin_v8i64() {
 ;
 ; SLM-LABEL: @umin_v8i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umin_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -136,50 +136,50 @@ define void @umin_v8i64() {
 define void @umin_v16i32() {
 ; SSE-LABEL: @umin_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umin_v16i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
-; SLM-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SLM-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umin_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -260,50 +260,50 @@ define void @umin_v16i32() {
 define void @umin_v32i16() {
 ; SSE-LABEL: @umin_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umin_v32i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SLM-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT:    [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SLM-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umin_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -448,50 +448,50 @@ define void @umin_v32i16() {
 define void @umin_v64i8() {
 ; SSE-LABEL: @umin_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @umin_v64i8(
 ; SLM-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-; SLM-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; SLM-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; SLM-NEXT:    [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SLM-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @umin_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]])
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]])
+; AVX-NEXT:    [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]])
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
index 7aa4d6728472b..721a98bb46059 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
@@ -40,10 +40,10 @@ define void @bitreverse_2i64() #0 {
 define void @bitreverse_4i64() #0 {
 ; SSE-LABEL: @bitreverse_4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
-; SSE-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]])
+; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
 ; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -99,10 +99,10 @@ define void @bitreverse_4i32() #0 {
 define void @bitreverse_8i32() #0 {
 ; SSE-LABEL: @bitreverse_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]])
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -182,10 +182,10 @@ define void @bitreverse_8i16() #0 {
 define void @bitreverse_16i16() #0 {
 ; SSE-LABEL: @bitreverse_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -313,10 +313,10 @@ define void @bitreverse_16i8() #0 {
 define void @bitreverse_32i8() #0 {
 ; SSE-LABEL: @bitreverse_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index cd8ae3540f7da..458193e5097cb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -11,22 +11,22 @@ define void @test(i32* %0, i32* %1, i32* %2) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP2:%.*]], i64 2
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP20]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[TMP23]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[TMP18]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP16]]
+; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add <4 x i32> [[TMP21]], [[TMP13]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP2:%.*]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
+; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP22]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP22]], <i32 0, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
index 6ec2cf3738117..03717ad13d82f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll
@@ -17,15 +17,15 @@ define void @bcast_vals(i64 *%A, i64 *%B, i64 *%S) {
 ; CHECK-NEXT:    [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8
 ; CHECK-NEXT:    [[V1:%.*]] = sub i64 [[A0]], 1
 ; CHECK-NEXT:    [[V2:%.*]] = sub i64 [[B0]], 1
-; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0
-; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1
-; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2
-; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    ret void
@@ -66,15 +66,11 @@ define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) {
 ; CHECK-LABEL: @bcast_vals2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8
-; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
-; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0
-; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
-; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
-; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
 ; CHECK-NEXT:    [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8
 ; CHECK-NEXT:    [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8
+; CHECK-NEXT:    [[V1:%.*]] = sext i16 [[A0]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2
@@ -83,6 +79,10 @@ define void @bcast_vals2(i16 *%A, i16 *%B, i16 *%C, i16 *%D, i16 *%E, i32 *%S) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]]
+; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
+; CHECK-NEXT:    [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
+; CHECK-NEXT:    [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
index 92a9e431c6e1c..bdb384f6fa9c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
@@ -103,10 +103,10 @@ define void @bswap_4i32() #0 {
 define void @bswap_8i32() #0 {
 ; SSE-LABEL: @bswap_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]])
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -180,10 +180,10 @@ define void @bswap_8i16() #0 {
 define void @bswap_16i16() #0 {
 ; SSE-LABEL: @bswap_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
index 6685fe260150c..2fdef624d48f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
@@ -23,21 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) {
 ; CHECK-NEXT:    [[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9
 ; CHECK-NEXT:    [[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10
 ; CHECK-NEXT:    [[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i64> [[TMP2]], <i64 4, i64 4>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i64> [[TMP6]], <i64 6, i64 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 4, i32 4, i32 6, i32 7>
-; CHECK-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
-; CHECK-NEXT:    store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 4, i32 4, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], <i64 4, i64 4>
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], <i64 6, i64 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %t0 = bitcast i8* %v0 to i32*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
index 737d8d1082118..8f57fe6866bd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -21,9 +21,9 @@ define void @foo_3double(i32 %u) #0 {
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[MUL]], 1
 ; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
@@ -85,9 +85,9 @@ define void @foo_2double(i32 %u) #0 {
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[MUL]], 1
 ; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
@@ -139,9 +139,9 @@ define void @foo_4float(i32 %u) #0 {
 ; CHECK-NEXT:    [[ADD37:%.*]] = add nsw i32 [[MUL]], 3
 ; CHECK-NEXT:    [[IDXPROM38:%.*]] = sext i32 [[ADD37]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM38]]
-; CHECK-NEXT:    [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
@@ -295,9 +295,9 @@ define void @foo_2double_non_power_of_2(i32 %u) #0 {
 ; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[MUL]], 7
 ; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD7]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
@@ -343,9 +343,9 @@ define void @foo_2double_non_power_of_2_zext(i32 %u) #0 {
 ; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[MUL]], 7
 ; CHECK-NEXT:    [[IDXPROM12:%.*]] = zext i32 [[ADD7]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
index bb60df259f17d..75248e1d7fa5c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
@@ -9,13 +9,13 @@ define void @test1(double* %a, double* %b, double* %c, double* %d) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[A]] to <4 x i32>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
index 7b6e6ca3c61af..de371d8895c7d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
@@ -4,12 +4,20 @@
 define void @exceed(double %0, double %1) {
 ; CHECK-LABEL: @exceed(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
 ; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
 ; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
@@ -19,21 +27,13 @@ define void @exceed(double %0, double %1) {
 ; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
index e6878e140a2f3..e3ff057355537 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -93,12 +93,12 @@ for.end48:                                        ; preds = %for.end44
 define void @zot(%struct.hoge* %arg) {
 ; CHECK-LABEL: @zot(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index 723c12d9b05b6..9c8fbf8a2ed90 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -36,13 +36,13 @@ define void @main() #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> <double poison, double undef>, double [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double poison, double undef>, double [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
@@ -114,7 +114,6 @@ define void @_Z8radianceRK3RayiPt() #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
-; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double undef, double poison>, double undef, i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]]
@@ -123,6 +122,7 @@ define void @_Z8radianceRK3RayiPt() #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
 ; CHECK-NEXT:    br label [[RETURN:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
index de6a2c169338f..a6d6dc2c1a5b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -16,21 +16,21 @@ define i32 @test(double* nocapture %G) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret i32 undef
@@ -133,24 +133,24 @@ define i32 @test2(double* nocapture %G, i32 %k) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
 ; CHECK-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8
 ; CHECK-NEXT:    br label [[TMP24:%.*]]
 ; CHECK:       14:
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
 ; CHECK-NEXT:    [[TMP17:%.*]] = load double, double* [[TMP16]], align 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x double> [[TMP20]], double [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP22]], <2 x double>* [[TMP23]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8
 ; CHECK-NEXT:    br label [[TMP24]]
 ; CHECK:       24:
 ; CHECK-NEXT:    ret i32 undef
@@ -267,10 +267,10 @@ define i32 @partial_mrg(double* nocapture %A, i32 %n) {
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[N]], 4
-; CHECK-NEXT:    [[CONV12:%.*]] = sitofp i32 [[ADD]] to double
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[N]], 4
+; CHECK-NEXT:    [[CONV12:%.*]] = sitofp i32 [[ADD]] to double
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
index 210ebe84d4f22..490263a396986 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -168,10 +168,10 @@ define void @ctlz_8i32() #0 {
 ;
 ; SSE42-LABEL: @ctlz_8i32(
 ; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE42-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE42-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE42-NEXT:    ret void
 ;
@@ -245,10 +245,10 @@ define void @ctlz_8i16() #0 {
 define void @ctlz_16i16() #0 {
 ; SSE-LABEL: @ctlz_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -370,10 +370,10 @@ define void @ctlz_16i8() #0 {
 define void @ctlz_32i8() #0 {
 ; SSE-LABEL: @ctlz_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false)
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -630,10 +630,10 @@ define void @ctlz_undef_8i32() #0 {
 ;
 ; SSE42-LABEL: @ctlz_undef_8i32(
 ; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 true)
+; SSE42-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 true)
+; SSE42-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE42-NEXT:    ret void
 ;
@@ -707,10 +707,10 @@ define void @ctlz_undef_8i16() #0 {
 define void @ctlz_undef_16i16() #0 {
 ; SSE-LABEL: @ctlz_undef_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 true)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true)
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -832,10 +832,10 @@ define void @ctlz_undef_16i8() #0 {
 define void @ctlz_undef_32i8() #0 {
 ; SSE-LABEL: @ctlz_undef_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 true)
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true)
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
index d87fa110531b0..973512922ea03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
@@ -63,10 +63,10 @@ define void @ctpop_2i64() #0 {
 define void @ctpop_4i64() #0 {
 ; SSE2-LABEL: @ctpop_4i64(
 ; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
-; SSE2-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
-; SSE2-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
-; SSE2-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP3]])
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
+; SSE2-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]])
+; SSE2-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
 ; SSE2-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
 ; SSE2-NEXT:    ret void
 ;
@@ -182,10 +182,10 @@ define void @ctpop_4i32() #0 {
 define void @ctpop_8i32() #0 {
 ; SSE2-LABEL: @ctpop_8i32(
 ; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
-; SSE2-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE2-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP3]])
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE2-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
+; SSE2-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE2-NEXT:    ret void
 ;
@@ -313,10 +313,10 @@ define void @ctpop_8i16() #0 {
 define void @ctpop_16i16() #0 {
 ; SSE-LABEL: @ctpop_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -438,10 +438,10 @@ define void @ctpop_16i8() #0 {
 define void @ctpop_32i8() #0 {
 ; SSE-LABEL: @ctpop_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]])
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
index bd584fc79c63c..7cbc4b1b88530 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -168,10 +168,10 @@ define void @cttz_8i32() #0 {
 ;
 ; SSE42-LABEL: @cttz_8i32(
 ; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 false)
+; SSE42-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE42-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE42-NEXT:    ret void
 ;
@@ -245,10 +245,10 @@ define void @cttz_8i16() #0 {
 define void @cttz_16i16() #0 {
 ; SSE-LABEL: @cttz_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false)
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -370,10 +370,10 @@ define void @cttz_16i8() #0 {
 define void @cttz_32i8() #0 {
 ; SSE-LABEL: @cttz_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false)
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 false)
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 false)
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -630,10 +630,10 @@ define void @cttz_undef_8i32() #0 {
 ;
 ; SSE42-LABEL: @cttz_undef_8i32(
 ; SSE42-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
-; SSE42-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
-; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 true)
+; SSE42-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE42-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true)
+; SSE42-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
 ; SSE42-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE42-NEXT:    ret void
 ;
@@ -707,10 +707,10 @@ define void @cttz_undef_8i16() #0 {
 define void @cttz_undef_16i16() #0 {
 ; SSE-LABEL: @cttz_undef_16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true)
-; SSE-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 true)
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 true)
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
@@ -832,10 +832,10 @@ define void @cttz_undef_16i8() #0 {
 define void @cttz_undef_32i8() #0 {
 ; SSE-LABEL: @cttz_undef_32i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true)
-; SSE-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 true)
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 true)
+; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
index 545aaf3cb411f..554170236184b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -21,12 +21,12 @@ define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i3
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret i32 0
@@ -72,12 +72,12 @@ define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
@@ -116,12 +116,12 @@ define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
index 63ba0bc6af7f7..830b882dac096 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll
@@ -7,10 +7,10 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    ret i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
index c4d3bbe297c51..01b75f4f9806e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll
@@ -7,10 +7,10 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    ret i32 0
@@ -37,10 +37,10 @@ define i32 @diamond_broadcast2(i32* noalias nocapture %B, i32* noalias nocapture
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    ret i32 0
@@ -67,10 +67,10 @@ define i32 @diamond_broadcast3(i32* noalias nocapture %B, i32* noalias nocapture
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    ret i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/
diff erent-vec-widths.ll b/llvm/test/Transforms/SLPVectorizer/X86/
diff erent-vec-widths.ll
index 849a8365094b5..c27fad02c077b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/
diff erent-vec-widths.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/
diff erent-vec-widths.ll
@@ -26,19 +26,19 @@ define void @PR28457(double* noalias nocapture align 32 %q, double* noalias noca
 ; SSE-NEXT:    [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5
 ; SSE-NEXT:    [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
 ; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 1.000000e+00>
-; SSE-NEXT:    [[TMP4:%.*]] = bitcast double* [[Q0]] to <2 x double>*
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
-; SSE-NEXT:    [[TMP5:%.*]] = bitcast double* [[P2]] to <2 x double>*
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast double* [[P2]] to <2 x double>*
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; SSE-NEXT:    [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>*
 ; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
-; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 1.000000e+00, double 1.000000e+00>
-; SSE-NEXT:    [[TMP8:%.*]] = bitcast double* [[Q2]] to <2 x double>*
-; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
-; SSE-NEXT:    [[TMP9:%.*]] = bitcast double* [[P4]] to <2 x double>*
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8
-; SSE-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 1.000000e+00>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 1.000000e+00>
+; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00>
+; SSE-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP6]], <double 1.000000e+00, double 1.000000e+00>
+; SSE-NEXT:    [[TMP10:%.*]] = bitcast double* [[Q0]] to <2 x double>*
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP10]], align 8
+; SSE-NEXT:    [[TMP11:%.*]] = bitcast double* [[Q2]] to <2 x double>*
+; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* [[TMP11]], align 8
 ; SSE-NEXT:    [[TMP12:%.*]] = bitcast double* [[Q4]] to <2 x double>*
-; SSE-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
+; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP12]], align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @PR28457(
@@ -56,14 +56,14 @@ define void @PR28457(double* noalias nocapture align 32 %q, double* noalias noca
 ; AVX-NEXT:    [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast double* [[P0]] to <4 x double>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; AVX-NEXT:    [[TMP4:%.*]] = bitcast double* [[Q0]] to <4 x double>*
-; AVX-NEXT:    store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8
-; AVX-NEXT:    [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>*
-; AVX-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
-; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 1.000000e+00, double 1.000000e+00>
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast double* [[P4]] to <2 x double>*
+; AVX-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; AVX-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP2]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00>
+; AVX-NEXT:    [[TMP7:%.*]] = bitcast double* [[Q0]] to <4 x double>*
+; AVX-NEXT:    store <4 x double> [[TMP5]], <4 x double>* [[TMP7]], align 8
 ; AVX-NEXT:    [[TMP8:%.*]] = bitcast double* [[Q4]] to <2 x double>*
-; AVX-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; AVX-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP8]], align 8
 ; AVX-NEXT:    ret void
 ;
   %p0 = getelementptr inbounds double, double* %p, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index 22f716e4a36ea..a77c322218b9b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -20,14 +20,14 @@ define double @dot4f64(double* dereferenceable(32) %ptrx, double* dereferenceabl
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[PTRX2]] to <2 x double>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[PTRY2]] to <2 x double>*
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[PTRX2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[PTRY2]] to <2 x double>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]]
@@ -71,14 +71,14 @@ define float @dot4f32(float* dereferenceable(16) %ptrx, float* dereferenceable(1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[PTRX2]] to <2 x float>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[PTRY2]] to <2 x float>*
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[PTRX2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[PTRY2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]]
@@ -202,11 +202,11 @@ define double @dot3f64(double* dereferenceable(32) %ptrx, double* dereferenceabl
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[X0:%.*]] = load double, double* [[PTRX]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = load double, double* [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]]
@@ -240,11 +240,11 @@ define float @dot3f32(float* dereferenceable(16) %ptrx, float* dereferenceable(1
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[X0:%.*]] = load float, float* [[PTRX]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = load float, float* [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]]
@@ -278,11 +278,11 @@ define double @dot3f64_fast(double* dereferenceable(32) %ptrx, double* dereferen
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[X0:%.*]] = load double, double* [[PTRX]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = load double, double* [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]]
@@ -316,11 +316,11 @@ define float @dot3f32_fast(float* dereferenceable(16) %ptrx, float* dereferencea
 ; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2
 ; CHECK-NEXT:    [[X0:%.*]] = load float, float* [[PTRX]], align 4
 ; CHECK-NEXT:    [[Y0:%.*]] = load float, float* [[PTRY]], align 4
-; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
index cc736eb807394..38b2b97a23cd0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -10,11 +10,11 @@ define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** @a, align 8
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> <i64 11, i64 56>
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>*
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8
@@ -41,18 +41,18 @@ define void @fn2(i32* %a, i32* %b, float* %c) {
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]])
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret void
@@ -103,11 +103,11 @@ define void @externally_used_ptrs() {
 ; CHECK-LABEL: @externally_used_ptrs(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** @a, align 8
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> <i64 56, i64 11>
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
index 8c2e864a6cb21..19898df286711 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
@@ -39,10 +39,10 @@ define void @fabs_2f64() #0 {
 define void @fabs_4f64() #0 {
 ; SSE-LABEL: @fabs_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -70,25 +70,25 @@ define void @fabs_4f64() #0 {
 define void @fabs_8f64() #0 {
 ; SSE-LABEL: @fabs_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]])
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP5]])
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fabs_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP2]])
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -150,10 +150,10 @@ define void @fabs_4f32() #0 {
 define void @fabs_8f32() #0 {
 ; SSE-LABEL: @fabs_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -193,25 +193,25 @@ define void @fabs_8f32() #0 {
 define void @fabs_16f32() #0 {
 ; SSE-LABEL: @fabs_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]])
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP5]])
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fabs_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP2]])
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
index dc2f382b747ba..c8948e85a4608 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
@@ -46,12 +46,12 @@ define void @fcopysign_2f64() #0 {
 define void @fcopysign_4f64() #0 {
 ; SSE-LABEL: @fcopysign_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -84,31 +84,31 @@ define void @fcopysign_4f64() #0 {
 define void @fcopysign_8f64() #0 {
 ; SSE-LABEL: @fcopysign_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]])
-; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
+; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fcopysign_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
-; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX256-NEXT:    store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -184,12 +184,12 @@ define void @fcopysign_4f32() #0 {
 define void @fcopysign_8f32() #0 {
 ; SSE-LABEL: @fcopysign_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -238,31 +238,31 @@ define void @fcopysign_8f32() #0 {
 define void @fcopysign_16f32() #0 {
 ; SSE-LABEL: @fcopysign_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]])
-; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fcopysign_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
-; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX256-NEXT:    store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll
index 9fa8c55c9fd9c..5c9f93d278533 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll
@@ -160,14 +160,14 @@ define void @fma_8f64() #0 {
 ;
 ; FMA256-LABEL: @fma_8f64(
 ; FMA256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]])
-; FMA256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
-; FMA256-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]])
+; FMA256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; FMA256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; FMA256-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; FMA256-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; FMA256-NEXT:    store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; FMA256-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; FMA256-NEXT:    ret void
 ;
@@ -458,14 +458,14 @@ define void @fma_16f32() #0 {
 ;
 ; FMA256-LABEL: @fma_16f32(
 ; FMA256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]])
-; FMA256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
-; FMA256-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; FMA256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA256-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; FMA256-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; FMA256-NEXT:    store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; FMA256-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; FMA256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
index 187f1a467ad51..8136f2cb2dfec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -44,12 +44,12 @@ define void @fmaxnum_2f64() #0 {
 define void @fmaxnum_4f64() #0 {
 ; SSE-LABEL: @fmaxnum_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -82,31 +82,31 @@ define void @fmaxnum_4f64() #0 {
 define void @fmaxnum_8f64() #0 {
 ; SSE-LABEL: @fmaxnum_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]])
-; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
+; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fmaxnum_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
-; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX256-NEXT:    store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -182,12 +182,12 @@ define void @fmaxnum_4f32() #0 {
 define void @fmaxnum_8f32() #0 {
 ; SSE-LABEL: @fmaxnum_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -236,31 +236,31 @@ define void @fmaxnum_8f32() #0 {
 define void @fmaxnum_16f32() #0 {
 ; SSE-LABEL: @fmaxnum_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]])
-; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fmaxnum_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
-; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX256-NEXT:    store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
index 2435832f87a39..470dc8290eee8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -44,12 +44,12 @@ define void @fminnum_2f64() #0 {
 define void @fminnum_4f64() #0 {
 ; SSE-LABEL: @fminnum_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -82,31 +82,31 @@ define void @fminnum_4f64() #0 {
 define void @fminnum_8f64() #0 {
 ; SSE-LABEL: @fminnum_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
-; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]])
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]])
-; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
+; SSE-NEXT:    store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fminnum_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
-; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX256-NEXT:    store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -182,12 +182,12 @@ define void @fminnum_4f32() #0 {
 define void @fminnum_8f32() #0 {
 ; SSE-LABEL: @fminnum_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -236,31 +236,31 @@ define void @fminnum_8f32() #0 {
 define void @fminnum_16f32() #0 {
 ; SSE-LABEL: @fminnum_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
-; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]])
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]])
-; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
+; SSE-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
+; SSE-NEXT:    store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fminnum_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
-; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX256-NEXT:    store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
index e192be3955fe4..f7b8087134f38 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
@@ -49,14 +49,14 @@ define void @fmuladd_2f64() #0 {
 define void @fmuladd_4f64() #0 {
 ; SSE-LABEL: @fmuladd_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x double> [[TMP5]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x double> [[TMP6]])
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -94,37 +94,37 @@ define void @fmuladd_4f64() #0 {
 define void @fmuladd_8f64() #0 {
 ; SSE-LABEL: @fmuladd_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
 ; SSE-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]])
-; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]])
-; SSE-NEXT:    store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP13:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP14:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP15:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]])
+; SSE-NEXT:    [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x double> [[TMP9]])
+; SSE-NEXT:    [[TMP14:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x double> [[TMP10]])
+; SSE-NEXT:    [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]], <2 x double> [[TMP11]])
+; SSE-NEXT:    [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]], <2 x double> [[TMP12]])
+; SSE-NEXT:    store <2 x double> [[TMP13]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP14]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP15]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fmuladd_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]])
-; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP7:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; AVX256-NEXT:    [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; AVX256-NEXT:    store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -214,14 +214,14 @@ define void @fmuladd_4f32() #0 {
 define void @fmuladd_8f32() #0 {
 ; SSE-LABEL: @fmuladd_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x float> [[TMP6]])
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -279,37 +279,37 @@ define void @fmuladd_8f32() #0 {
 define void @fmuladd_16f32() #0 {
 ; SSE-LABEL: @fmuladd_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
 ; SSE-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]])
-; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]])
-; SSE-NEXT:    store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP13:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP14:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP15:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]])
+; SSE-NEXT:    [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]], <4 x float> [[TMP9]])
+; SSE-NEXT:    [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]])
+; SSE-NEXT:    [[TMP15:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]])
+; SSE-NEXT:    [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]], <4 x float> [[TMP12]])
+; SSE-NEXT:    store <4 x float> [[TMP13]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP15]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @fmuladd_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]])
-; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; AVX256-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; AVX256-NEXT:    store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
index aa4cd49e4649e..ed071f90ad76f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@@ -82,10 +82,10 @@ define void @fptosi_8f64_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptosi_8f64_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -119,10 +119,10 @@ define void @fptosi_8f64_8i64() #0 {
 define void @fptosi_8f64_8i32() #0 {
 ; SSE-LABEL: @fptosi_8f64_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -315,10 +315,10 @@ define void @fptosi_8f32_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptosi_8f32_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -352,10 +352,10 @@ define void @fptosi_8f32_8i64() #0 {
 define void @fptosi_8f32_8i32() #0 {
 ; SSE-LABEL: @fptosi_8f32_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
index 3490fa909486e..8d46e40fc5a9e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -82,10 +82,10 @@ define void @fptosi_8f64_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptosi_8f64_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -119,10 +119,10 @@ define void @fptosi_8f64_8i64() #0 {
 define void @fptosi_8f64_8i32() #0 {
 ; SSE-LABEL: @fptosi_8f64_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -315,10 +315,10 @@ define void @fptosi_8f32_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptosi_8f32_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -352,10 +352,10 @@ define void @fptosi_8f32_8i64() #0 {
 define void @fptosi_8f32_8i32() #0 {
 ; SSE-LABEL: @fptosi_8f32_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index b3698f260f360..6842b8a0ef1b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -82,10 +82,10 @@ define void @fptoui_8f64_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptoui_8f64_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -119,10 +119,10 @@ define void @fptoui_8f64_8i64() #0 {
 define void @fptoui_8f64_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f64_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -315,10 +315,10 @@ define void @fptoui_8f32_8i64() #0 {
 ;
 ; AVX256DQ-LABEL: @fptoui_8f32_8i64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64>
-; AVX256DQ-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -352,10 +352,10 @@ define void @fptoui_8f32_8i64() #0 {
 define void @fptoui_8f32_8i32() #0 {
 ; SSE-LABEL: @fptoui_8f32_8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
-; SSE-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll
index 8089f70d342a8..21a36e6227bbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll
@@ -74,10 +74,10 @@ define void @ceil_4f64() #0 {
 ;
 ; SSE41-LABEL: @ceil_4f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
@@ -132,34 +132,34 @@ define void @ceil_8f64() #0 {
 ;
 ; SSE41-LABEL: @ceil_8f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])
-; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP5]])
-; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @ceil_8f64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
-; AVX1-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @ceil_8f64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -245,10 +245,10 @@ define void @floor_4f64() #0 {
 ;
 ; SSE41-LABEL: @floor_4f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
@@ -303,34 +303,34 @@ define void @floor_8f64() #0 {
 ;
 ; SSE41-LABEL: @floor_8f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])
-; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP5]])
-; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @floor_8f64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
-; AVX1-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @floor_8f64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -416,10 +416,10 @@ define void @nearbyint_4f64() #0 {
 ;
 ; SSE41-LABEL: @nearbyint_4f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
@@ -474,34 +474,34 @@ define void @nearbyint_8f64() #0 {
 ;
 ; SSE41-LABEL: @nearbyint_8f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])
-; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP5]])
-; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @nearbyint_8f64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
-; AVX1-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @nearbyint_8f64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -587,10 +587,10 @@ define void @rint_4f64() #0 {
 ;
 ; SSE41-LABEL: @rint_4f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
@@ -645,34 +645,34 @@ define void @rint_8f64() #0 {
 ;
 ; SSE41-LABEL: @rint_8f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
-; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP5]])
-; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @rint_8f64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
-; AVX1-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @rint_8f64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -758,10 +758,10 @@ define void @trunc_4f64() #0 {
 ;
 ; SSE41-LABEL: @trunc_4f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
@@ -816,34 +816,34 @@ define void @trunc_8f64() #0 {
 ;
 ; SSE41-LABEL: @trunc_8f64(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
-; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]])
-; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP5]])
-; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
-; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
 ; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @trunc_8f64(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
-; AVX1-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @trunc_8f64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
-; AVX2-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
 ; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -953,10 +953,10 @@ define void @ceil_8f32() #0 {
 ;
 ; SSE41-LABEL: @ceil_8f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
@@ -1047,34 +1047,34 @@ define void @ceil_16f32() #0 {
 ;
 ; SSE41-LABEL: @ceil_16f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
-; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP5]])
-; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @ceil_16f32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
-; AVX1-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX1-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @ceil_16f32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
-; AVX2-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX2-NEXT:    ret void
 ;
@@ -1208,10 +1208,10 @@ define void @floor_8f32() #0 {
 ;
 ; SSE41-LABEL: @floor_8f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
@@ -1302,34 +1302,34 @@ define void @floor_16f32() #0 {
 ;
 ; SSE41-LABEL: @floor_16f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])
-; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP5]])
-; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @floor_16f32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
-; AVX1-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX1-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @floor_16f32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
-; AVX2-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX2-NEXT:    ret void
 ;
@@ -1463,10 +1463,10 @@ define void @nearbyint_8f32() #0 {
 ;
 ; SSE41-LABEL: @nearbyint_8f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
@@ -1557,34 +1557,34 @@ define void @nearbyint_16f32() #0 {
 ;
 ; SSE41-LABEL: @nearbyint_16f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]])
-; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP5]])
-; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @nearbyint_16f32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
-; AVX1-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX1-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @nearbyint_16f32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
-; AVX2-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX2-NEXT:    ret void
 ;
@@ -1718,10 +1718,10 @@ define void @rint_8f32() #0 {
 ;
 ; SSE41-LABEL: @rint_8f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
@@ -1812,34 +1812,34 @@ define void @rint_16f32() #0 {
 ;
 ; SSE41-LABEL: @rint_16f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])
-; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP5]])
-; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @rint_16f32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
-; AVX1-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX1-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @rint_16f32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
-; AVX2-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX2-NEXT:    ret void
 ;
@@ -1973,10 +1973,10 @@ define void @trunc_8f32() #0 {
 ;
 ; SSE41-LABEL: @trunc_8f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
@@ -2067,34 +2067,34 @@ define void @trunc_16f32() #0 {
 ;
 ; SSE41-LABEL: @trunc_16f32(
 ; SSE41-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
-; SSE41-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]])
-; SSE41-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP5]])
-; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP7]])
+; SSE41-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE41-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE41-NEXT:    ret void
 ;
 ; AVX1-LABEL: @trunc_16f32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
-; AVX1-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]])
+; AVX1-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX1-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX1-NEXT:    ret void
 ;
 ; AVX2-LABEL: @trunc_16f32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
-; AVX2-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX2-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX2-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
index c1a4b88b015bf..ae24e92d9e515 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
@@ -13,14 +13,14 @@ define void @test1(double* %a, double* %b, double* %c) #0 personality i32 (...)*
 ; CHECK:       catch:
 ; CHECK-NEXT:    [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null]
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[B]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
 ; CHECK-NEXT:    catchret from [[TMP1]] to label [[TRY_CONT:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll
index d9e1a25c76e47..f04fff807379d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -14,12 +14,12 @@ define void @foo1 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y:%.*]], i64 0, i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X:%.*]], i64 0, i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> <i64 16, i64 16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 16, i64 16>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
-; CHECK-NEXT:    store <2 x i32*> [[TMP7]], <2 x i32*>* [[TMP8]], align 8
+; CHECK-NEXT:    store <2 x i32*> [[TMP6]], <2 x i32*>* [[TMP8]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %1 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 2bfd90c09fe0d..58e40177f54e2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -101,11 +101,11 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
-; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
+; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
 ; CHECK-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
@@ -117,11 +117,11 @@ define float @bazz() {
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
-; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
+; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
 ; THRESHOLD-NEXT:    [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 49b55933d989e..eb317de222f2a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -110,20 +110,20 @@ define i32 @maxi8_store_in(i32) {
 ; SSE-NEXT:    ret i32 [[TMP23]]
 ;
 ; AVX-LABEL: @maxi8_store_in(
-; AVX-NEXT:    store i32 0, i32* @var, align 8
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; AVX-NEXT:    store i32 0, i32* @var, align 8
 ; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
 ; AVX-NEXT:    ret i32 [[TMP3]]
 ;
 ; AVX2-LABEL: @maxi8_store_in(
-; AVX2-NEXT:    store i32 0, i32* @var, align 8
 ; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; AVX2-NEXT:    store i32 0, i32* @var, align 8
 ; AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
 ; AVX2-NEXT:    ret i32 [[TMP3]]
 ;
 ; THRESH-LABEL: @maxi8_store_in(
-; THRESH-NEXT:    store i32 0, i32* @var, align 8
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; THRESH-NEXT:    store i32 0, i32* @var, align 8
 ; THRESH-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
 ; THRESH-NEXT:    ret i32 [[TMP3]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 8f19a93258611..fd1544634d9dd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -782,10 +782,10 @@ define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i
 ; STORE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
 ; STORE-NEXT:    [[ADD1135:%.*]] = or i64 [[MUL]], 2
 ; STORE-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
-; STORE-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
-; STORE-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
 ; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>*
 ; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; STORE-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
+; STORE-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
 ; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
 ; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index b0fbc6f21433a..7be473214244d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -51,16 +51,15 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
 ; SSE-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
 ; SSE-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
-; SSE-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
-; SSE-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
-; SSE-NEXT:    [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
-; SSE-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
 ; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; SSE-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
 ; SSE-NEXT:    [[TMP6:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
+; SSE-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
 ; SSE-NEXT:    [[TMP8:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; SSE-NEXT:    [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
 ; SSE-NEXT:    [[TMP10:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = icmp ult <4 x i8> [[TMP5]], [[TMP7]]
@@ -68,6 +67,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32>
 ; SSE-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[SHUFFLE]]
 ; SSE-NEXT:    [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
 ; SSE-NEXT:    [[TMP17:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>*
 ; SSE-NEXT:    store <4 x i8> [[TMP16]], <4 x i8>* [[TMP17]], align 1
 ; SSE-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
@@ -86,16 +86,15 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
 ; SSE-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
 ; SSE-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
-; SSE-NEXT:    [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
-; SSE-NEXT:    [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
-; SSE-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
-; SSE-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
 ; SSE-NEXT:    [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
+; SSE-NEXT:    [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
 ; SSE-NEXT:    [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
+; SSE-NEXT:    [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
 ; SSE-NEXT:    [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
+; SSE-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
 ; SSE-NEXT:    [[TMP24:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1
 ; SSE-NEXT:    [[TMP26:%.*]] = icmp ult <4 x i8> [[TMP19]], [[TMP21]]
@@ -103,6 +102,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
 ; SSE-NEXT:    [[TMP29:%.*]] = mul <4 x i32> [[TMP28]], [[SHUFFLE1]]
 ; SSE-NEXT:    [[TMP30:%.*]] = trunc <4 x i32> [[TMP29]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
 ; SSE-NEXT:    [[TMP31:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>*
 ; SSE-NEXT:    store <4 x i8> [[TMP30]], <4 x i8>* [[TMP31]], align 1
 ; SSE-NEXT:    [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
@@ -121,16 +121,15 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
 ; SSE-NEXT:    [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
 ; SSE-NEXT:    [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
-; SSE-NEXT:    [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
-; SSE-NEXT:    [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
-; SSE-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
-; SSE-NEXT:    [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
 ; SSE-NEXT:    [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1
+; SSE-NEXT:    [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
 ; SSE-NEXT:    [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
+; SSE-NEXT:    [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
 ; SSE-NEXT:    [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1
+; SSE-NEXT:    [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
 ; SSE-NEXT:    [[TMP38:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP39:%.*]] = load <4 x i8>, <4 x i8>* [[TMP38]], align 1
 ; SSE-NEXT:    [[TMP40:%.*]] = icmp ult <4 x i8> [[TMP33]], [[TMP35]]
@@ -138,6 +137,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[TMP42:%.*]] = zext <4 x i8> [[TMP41]] to <4 x i32>
 ; SSE-NEXT:    [[TMP43:%.*]] = mul <4 x i32> [[TMP42]], [[SHUFFLE2]]
 ; SSE-NEXT:    [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
 ; SSE-NEXT:    [[TMP45:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>*
 ; SSE-NEXT:    store <4 x i8> [[TMP44]], <4 x i8>* [[TMP45]], align 1
 ; SSE-NEXT:    [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
@@ -156,16 +156,15 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
 ; SSE-NEXT:    [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
 ; SSE-NEXT:    [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
-; SSE-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
-; SSE-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
-; SSE-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
-; SSE-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; SSE-NEXT:    [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1
+; SSE-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
 ; SSE-NEXT:    [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1
+; SSE-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
 ; SSE-NEXT:    [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1
+; SSE-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
 ; SSE-NEXT:    [[TMP52:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>*
 ; SSE-NEXT:    [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
 ; SSE-NEXT:    [[TMP54:%.*]] = icmp ult <4 x i8> [[TMP47]], [[TMP49]]
@@ -173,6 +172,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; SSE-NEXT:    [[TMP56:%.*]] = zext <4 x i8> [[TMP55]] to <4 x i32>
 ; SSE-NEXT:    [[TMP57:%.*]] = mul <4 x i32> [[TMP56]], [[SHUFFLE3]]
 ; SSE-NEXT:    [[TMP58:%.*]] = trunc <4 x i32> [[TMP57]] to <4 x i8>
+; SSE-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; SSE-NEXT:    [[TMP59:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>*
 ; SSE-NEXT:    store <4 x i8> [[TMP58]], <4 x i8>* [[TMP59]], align 1
 ; SSE-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
@@ -269,16 +269,15 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; AVX512-NEXT:    [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
 ; AVX512-NEXT:    [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
 ; AVX512-NEXT:    [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
-; AVX512-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
-; AVX512-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
-; AVX512-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
-; AVX512-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
 ; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; AVX512-NEXT:    [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
 ; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
 ; AVX512-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; AVX512-NEXT:    [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
 ; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
 ; AVX512-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1
+; AVX512-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
 ; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
 ; AVX512-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
 ; AVX512-NEXT:    [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP2]], [[TMP4]]
@@ -286,6 +285,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; AVX512-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
 ; AVX512-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP11]], [[SHUFFLE]]
 ; AVX512-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; AVX512-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; AVX512-NEXT:    [[TMP14:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
 ; AVX512-NEXT:    store <16 x i8> [[TMP13]], <16 x i8>* [[TMP14]], align 1
 ; AVX512-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index b97d6018c3ddb..d9b6207997236 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -292,20 +292,20 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x float> [[RD1]]
@@ -451,14 +451,14 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1
+; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
+; MINTREESIZE-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP11]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 73c4187efa8f0..2b28765d82a04 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -327,20 +327,20 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x float> [[RD1]]
@@ -486,14 +486,14 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; MINTREESIZE-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1
-; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1
-; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
-; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1
-; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1
-; MINTREESIZE-NEXT:    [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]]
-; MINTREESIZE-NEXT:    ret <4 x float> [[TMP17]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]]
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; MINTREESIZE-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1
+; MINTREESIZE-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1
+; MINTREESIZE-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
+; MINTREESIZE-NEXT:    [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1
+; MINTREESIZE-NEXT:    ret <4 x float> [[TMP11]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
index fd9e826904f52..9e4645e8eb038 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll
@@ -9,12 +9,12 @@ define { <2 x float>, <2 x float> } @foo(%struct.sw* %v) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* undef, align 4
 ; CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds [[STRUCT_SW:%.*]], %struct.sw* [[V:%.*]], i64 0, i32 0
 ; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_SW]], %struct.sw* [[V]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* undef, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[X]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* undef, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], poison

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
index ef7e827cd62a7..ed58f407628cd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -7,14 +7,14 @@ define void @julia_2xdouble([2 x double]* sret([2 x double]), [2 x double]*, [2
 ; CHECK-NEXT:    [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
 ; CHECK-NEXT:    [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
-; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
-; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT:    [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
@@ -58,16 +58,16 @@ define void @julia_4xfloat([4 x float]* sret([4 x float]), [4 x float]*, [4 x fl
 ; CHECK-NEXT:    [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
 ; CHECK-NEXT:    [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
 ; CHECK-NEXT:    [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
-; CHECK-NEXT:    [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
-; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
-; CHECK-NEXT:    [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
-; CHECK-NEXT:    [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT:    [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT:    [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
index c087239f37b2e..bde72f647604e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll
@@ -8,9 +8,9 @@ define void @inst_size(i64* %a, <2 x i64> %b) {
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
 ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3
-; CHECK-NEXT:    [[T41:%.*]] = icmp sgt i64 0, [[VAL]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4
+; CHECK-NEXT:    [[T41:%.*]] = icmp sgt i64 0, [[VAL]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    br label [[BLOCK:%.*]]
 ; CHECK:       block:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
index 7b2b50de7ca46..950bbcb7d5dd6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll
@@ -8,12 +8,12 @@ define void @vec_powi_f32(float* %a, float* %c, i32 %P) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
-; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]])
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
index 6a2abcea80b4c..820d85eb309d1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
@@ -22,16 +22,16 @@ define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP1]], [[SHUFFLE]]
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
@@ -78,16 +78,16 @@ define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index 409ebe579a7e5..11e313bdbe6fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -9,20 +9,20 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4
@@ -67,10 +67,6 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
@@ -82,6 +78,10 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]]
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
index 6721b046c504c..33ad7c4c4ec0f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
@@ -17,36 +17,36 @@ define dso_local void @j() local_unnamed_addr {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 1.000000e+01, float 1.000000e+01>
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> <float 1.000000e+00, float 0.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    store float [[TMP9]], float* @g, align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], <float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; CHECK-NEXT:    store float [[TMP11]], float* @c, align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    store float [[TMP12]], float* @d, align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; CHECK-NEXT:    store float [[TMP13]], float* @e, align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    store float [[TMP14]], float* @f, align 4
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* @a, align 4
-; CHECK-NEXT:    [[CONV19:%.*]] = sitofp i32 [[TMP1]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], <float 1.000000e+01, float 1.000000e+01>
-; CHECK-NEXT:    [[TMP9:%.*]] = fsub <2 x float> <float 1.000000e+00, float 0.000000e+00>, [[TMP8]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    store float [[TMP10]], float* @g, align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]], <float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT:    store float [[TMP12]], float* @c, align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    store float [[TMP13]], float* @d, align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
-; CHECK-NEXT:    store float [[TMP14]], float* @e, align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    store float [[TMP15]], float* @f, align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> <float poison, float -1.000000e+00, float poison, float -1.000000e+00>, float [[CONV19]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2
-; CHECK-NEXT:    [[TMP19:%.*]] = fsub <4 x float> [[TMP11]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <4 x float> [[TMP11]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index c6e3b1088ef25..3d68b9b8fa065 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -148,17 +148,17 @@ define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 {
 ; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index c9ec8c8df6b2d..2683ec1e1d722 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -148,17 +148,17 @@ define void @PR43578_prefer128(i32* %r, i64* %p, i64* %q) #0 {
 ; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index df0ad7d02c681..ba3bd26d38610 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -237,8 +237,6 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
-; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
@@ -256,6 +254,8 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -329,8 +329,6 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
-; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
@@ -348,6 +346,8 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
@@ -416,8 +416,6 @@ define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
 ; CHECK-LABEL: @lookahead_crash(
 ; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
-; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
-; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
@@ -425,6 +423,8 @@ define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
+; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
@@ -457,8 +457,6 @@ define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x doubl
 ; CHECK-NEXT:    [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4
 ; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
 ; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
-; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
-; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
@@ -466,6 +464,8 @@ define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x doubl
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
+; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
 ; CHECK-NEXT:    ret void
@@ -591,6 +591,8 @@ define void @ChecksExtractScores_
diff erent_vectors(double* %storeArray, double*
 ; CHECK-LABEL: @ChecksExtractScores_
diff erent_vectors(
 ; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
 ; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
 ; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
 ; CHECK-NEXT:    [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
@@ -599,10 +601,6 @@ define void @ChecksExtractScores_
diff erent_vectors(double* %storeArray, double*
 ; CHECK-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
 ; CHECK-NEXT:    [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
 ; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
-; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
-; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
@@ -611,6 +609,8 @@ define void @ChecksExtractScores_
diff erent_vectors(double* %storeArray, double*
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]]
+; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
+; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
index f043e34542de0..5bfd4f0f5fcf7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
@@ -34,12 +34,12 @@ entry:
 define void @test2(double* %a, double* %b, i8* %e) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !5
+; CHECK-NEXT:    [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
index f39abd1cead22..1b224cb4109c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -28,11 +28,11 @@ define i32 @bar(double* nocapture %A, i32 %d) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 (...) @foo()
 ; CHECK-NEXT:    br label [[TMP7]]
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
-; CHECK-NEXT:    [[TMP10:%.*]] = fpext <2 x float> [[TMP9]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 9.000000e+00, double 5.000000e+00>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
index 9616940c1982d..17cd58d410c3d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -9,16 +9,16 @@ target triple = "i386-apple-macosx10.9.0"
 define void @test(double* %i1, double* %i2, double* %o) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I1_GEP1:%.*]] = getelementptr double, double* [[I1:%.*]], i64 1
-; CHECK-NEXT:    [[I1_0:%.*]] = load double, double* [[I1]], align 16
+; CHECK-NEXT:    [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16
+; CHECK-NEXT:    [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1
 ; CHECK-NEXT:    [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1
 ; CHECK-NEXT:    br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0
-; CHECK-NEXT:    [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1
 ; CHECK-NEXT:    [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16
+; CHECK-NEXT:    [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1
 ; CHECK-NEXT:    [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I2_0]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
index 0a3e501ef14c9..a7abe86c93fb8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -13,14 +13,14 @@ define void @powof2div_uniform(i32* noalias nocapture %a, i32* noalias nocapture
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -101,14 +101,14 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapt
 ; AVX-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; AVX-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; AVX-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; AVX-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; AVX-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; AVX-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; AVX-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; AVX-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; AVX-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; AVX-NEXT:    [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 4, i32 8, i32 16>
+; AVX-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; AVX-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; AVX-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; AVX-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll
index dd83351dbd29f..385d323801a99 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll
@@ -13,14 +13,14 @@ define void @powof2mul_uniform(i32* noalias nocapture %a, i32* noalias nocapture
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -68,14 +68,14 @@ define void @negpowof2mul_uniform(i32* noalias nocapture %a, i32* noalias nocapt
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], <i32 -2, i32 -2, i32 -2, i32 -2>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -123,14 +123,14 @@ define void @powof2mul_nonuniform(i32* noalias nocapture %a, i32* noalias nocapt
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], <i32 2, i32 4, i32 8, i32 16>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -178,14 +178,14 @@ define void @negpowof2mul_nonuniform(i32* noalias nocapture %a, i32* noalias noc
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], <i32 -2, i32 -4, i32 -8, i32 -16>
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
@@ -278,16 +278,16 @@ define void @PR51436(i64* nocapture %a) {
 ; AVX-NEXT:    [[GEP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 7
 ; AVX-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>*
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
-; AVX-NEXT:    [[TMP2:%.*]] = mul <4 x i64> [[TMP1]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
-; AVX-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP2]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
-; AVX-NEXT:    [[TMP4:%.*]] = bitcast i64* [[A]] to <4 x i64>*
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* [[TMP4]], align 8
-; AVX-NEXT:    [[TMP5:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>*
-; AVX-NEXT:    [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8
-; AVX-NEXT:    [[TMP7:%.*]] = mul <4 x i64> [[TMP6]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
-; AVX-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[TMP7]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
+; AVX-NEXT:    [[TMP2:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>*
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; AVX-NEXT:    [[TMP4:%.*]] = mul <4 x i64> [[TMP1]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
+; AVX-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
+; AVX-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP4]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
+; AVX-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], <i64 -17592186044416, i64 -17592186044416, i64 -17592186044416, i64 -17592186044416>
+; AVX-NEXT:    [[TMP8:%.*]] = bitcast i64* [[A]] to <4 x i64>*
+; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* [[TMP8]], align 8
 ; AVX-NEXT:    [[TMP9:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>*
-; AVX-NEXT:    store <4 x i64> [[TMP8]], <4 x i64>* [[TMP9]], align 8
+; AVX-NEXT:    store <4 x i64> [[TMP7]], <4 x i64>* [[TMP9]], align 8
 ; AVX-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
index f3cdf5650b12c..f36d8cbe1f3f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -35,14 +35,14 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
 ; AVX-NEXT:    store i64 [[OR_1]], i64* undef, align 8
 ; AVX-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
 ; AVX-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; AVX-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
-; AVX-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
-; AVX-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
 ; AVX-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
 ; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; AVX-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
 ; AVX-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0
 ; AVX-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
 ; AVX-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; AVX-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; AVX-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
 ; AVX-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
 ; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
 ; AVX-NEXT:    ret void
@@ -72,22 +72,22 @@ define void @pr35497() local_unnamed_addr #0 {
 ; SSE-NEXT:    [[ADD:%.*]] = add i64 undef, undef
 ; SSE-NEXT:    store i64 [[ADD]], i64* undef, align 1
 ; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; SSE-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
 ; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; SSE-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; SSE-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; SSE-NEXT:    [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; SSE-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; SSE-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
-; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
 ; SSE-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; SSE-NEXT:    ret void
@@ -98,22 +98,22 @@ define void @pr35497() local_unnamed_addr #0 {
 ; AVX-NEXT:    [[ADD:%.*]] = add i64 undef, undef
 ; AVX-NEXT:    store i64 [[ADD]], i64* undef, align 1
 ; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
-; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
-; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
-; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> <i64 undef, i64 poison>, i64 [[TMP0]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
 ; AVX-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; AVX-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
 ; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1
-; AVX-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], <i64 2, i64 2>
-; AVX-NEXT:    [[TMP10:%.*]] = and <2 x i64> [[TMP9]], <i64 20, i64 20>
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; AVX-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; AVX-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; AVX-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
 ; AVX-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
-; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
 ; AVX-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
 ; AVX-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
 ; AVX-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index b7741a5edf59e..f6dd7526e6e76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -68,16 +68,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
 ; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
@@ -86,16 +86,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
 ;
 ; AVX2-LABEL: @gather_load_2(
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
 ; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
@@ -200,28 +200,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_3(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
@@ -229,28 +229,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX2-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX2-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX2-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX2-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX2-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX2-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX2-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX2-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
@@ -303,19 +303,19 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
 ; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
 ; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], 2
 ; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
 ; AVX512VL-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 3
@@ -511,26 +511,26 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 ;
 ; AVX512VL-LABEL: @gather_load_4(
 ; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
 ; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9
+; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
 ; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
 ; AVX512VL-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
 ; AVX512VL-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
 ; AVX512VL-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
 ; AVX512VL-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
 ; AVX512VL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
@@ -586,109 +586,109 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 
 define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
-; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0
-; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1
-; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3
-; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0
-; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1
-; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2
-; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3
-; SSE-NEXT:    [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3
+; SSE-NEXT:    [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]]
+; SSE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; SSE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; SSE-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2
-; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3
-; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0
-; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1
-; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2
+; SSE-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; SSE-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; SSE-NEXT:    [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; SSE-NEXT:    [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; SSE-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2
+; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3
+; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0
+; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1
+; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2
 ; SSE-NEXT:    [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3
 ; SSE-NEXT:    [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]]
-; SSE-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
+; SSE-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>*
 ; SSE-NEXT:    store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; AVX-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; AVX-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
@@ -696,52 +696,52 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; AVX2-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX2-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX2-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; AVX2-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX2-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX2-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX2-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 2d8bb0f4e0245..fd1c612a0696e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -68,16 +68,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
 ;
 ; AVX-LABEL: @gather_load_2(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
 ; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
@@ -86,16 +86,16 @@ define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture rea
 ;
 ; AVX2-LABEL: @gather_load_2(
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
 ; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
@@ -200,28 +200,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_3(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
+; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
@@ -229,28 +229,28 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_3(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX2-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX2-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX2-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
+; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX2-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX2-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
+; AVX2-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
+; AVX2-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
+; AVX2-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
+; AVX2-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
 ; AVX2-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
 ; AVX2-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
@@ -303,19 +303,19 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
 ; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
 ; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512VL-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
 ; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], 2
 ; AVX512VL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
 ; AVX512VL-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 3
@@ -511,26 +511,26 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 ;
 ; AVX512VL-LABEL: @gather_load_4(
 ; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
 ; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
-; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9
+; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
 ; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
 ; AVX512VL-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
 ; AVX512VL-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
 ; AVX512VL-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
 ; AVX512VL-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
 ; AVX512VL-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
 ; AVX512VL-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
 ; AVX512VL-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
 ; AVX512VL-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
@@ -586,109 +586,109 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 
 define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
-; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0
-; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1
-; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3
-; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0
-; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1
-; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2
-; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3
-; SSE-NEXT:    [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
+; SSE-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; SSE-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1
+; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2
+; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
+; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1
+; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2
+; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3
+; SSE-NEXT:    [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]]
+; SSE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; SSE-NEXT:    store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; SSE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; SSE-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2
-; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3
-; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0
-; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1
-; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2
+; SSE-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; SSE-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; SSE-NEXT:    [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; SSE-NEXT:    [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; SSE-NEXT:    [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; SSE-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; SSE-NEXT:    [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2
+; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3
+; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0
+; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1
+; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2
 ; SSE-NEXT:    [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3
 ; SSE-NEXT:    [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]]
-; SSE-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
+; SSE-NEXT:    [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>*
 ; SSE-NEXT:    store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; AVX-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; AVX-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
@@ -696,52 +696,52 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
-; AVX2-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
-; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
-; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
-; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
-; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
-; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
+; AVX2-NEXT:    [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
+; AVX2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
+; AVX2-NEXT:    [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
+; AVX2-NEXT:    [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
+; AVX2-NEXT:    [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
+; AVX2-NEXT:    [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
+; AVX2-NEXT:    [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
+; AVX2-NEXT:    [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
+; AVX2-NEXT:    [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
+; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
+; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
+; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
+; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
+; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
 ; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
 ; AVX2-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
 ; AVX2-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index 6fc3921f5f8dc..149303c4bdc46 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -20,22 +20,22 @@ define i32 @foo(i32* %
diff ) #0 {
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP1]], 5
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]]
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP1]], 6
 ; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]]
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP1]], 7
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]]
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP1]], 7
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
index a08ae378fd173..5b00b2e044a57 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -7,33 +7,33 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* undef, i64 5
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* undef, i64 6
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* undef, i64 7
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP12:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shl nsw <4 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP14]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP25]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i32> [[TMP10]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP21]]
+; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP21]]
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> [[TMP26]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP27]], <4 x i32>* [[TMP29]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %1 = getelementptr inbounds i8, i8* undef, i64 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index f2f12939894f2..d3da0c9572028 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -9,6 +9,8 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]]
 ; CHECK:       if.then22.i:
+; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
+; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
@@ -17,23 +19,20 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
-; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
-; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
-; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
-; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
 ; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], <i32 14, i32 15>
@@ -47,6 +46,7 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP17:%.*]] = and <16 x i8> [[TMP16]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 ; CHECK-NEXT:    store <16 x i8> [[TMP17]], <16 x i8>* [[TMP18]], align 1
 ; CHECK-NEXT:    unreachable

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
index 6eb1bff85ada0..4f7448f371028 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/return.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
@@ -44,9 +44,9 @@ define double @return2(double* nocapture readonly %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
index 5f08d13cbec21..769c752ecb067 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
@@ -16,12 +16,12 @@ define i32 @foo(i32 %0, i32* %1, float* %2)  {
 ; CHECK-NEXT:    br label [[T37:%.*]]
 ; CHECK:       t37:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0
 ; CHECK-NEXT:    [[T25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1
 ; CHECK-NEXT:    [[T31:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2
 ; CHECK-NEXT:    [[T33:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 3
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP6]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
index 3e4cfe6e05157..aabd7260d4aae 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
@@ -53,11 +53,11 @@ define void @test(float * %a, float * %b, float * %c, float * %d) {
 ; CHECK-NEXT:    [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2
 ; CHECK-NEXT:    [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1
 ; CHECK-NEXT:    [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2
 ; CHECK-NEXT:    [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[D]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
index a95f658eb411d..875fcbc52e6a8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -19,22 +19,22 @@ define i32 @foo(i32* nocapture readonly %
diff ) #0 {
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP1]], 5
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]]
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 2
 ; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP1]], 6
 ; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]]
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP1]], 7
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]]
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP1]], 7
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
index 5d41283995606..d3aa91a0e76c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -93,12 +93,12 @@ define void @ashr_v8i64() {
 ;
 ; AVX2-LABEL: @ashr_v8i64(
 ; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX2-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT:    [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX2-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX2-NEXT:    ret void
 ;
@@ -111,12 +111,12 @@ define void @ashr_v8i64() {
 ;
 ; XOP-LABEL: @ashr_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; XOP-NEXT:    ret void
 ;
@@ -158,31 +158,31 @@ define void @ashr_v8i64() {
 define void @ashr_v16i32() {
 ; SSE-LABEL: @ashr_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = ashr <4 x i32> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = ashr <4 x i32> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @ashr_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -195,12 +195,12 @@ define void @ashr_v16i32() {
 ;
 ; XOP-LABEL: @ashr_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; XOP-NEXT:    ret void
 ;
@@ -274,31 +274,31 @@ define void @ashr_v16i32() {
 define void @ashr_v32i16() {
 ; SSE-LABEL: @ashr_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = ashr <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = ashr <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = ashr <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = ashr <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @ashr_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -311,12 +311,12 @@ define void @ashr_v32i16() {
 ;
 ; XOP-LABEL: @ashr_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; XOP-NEXT:    ret void
 ;
@@ -454,31 +454,31 @@ define void @ashr_v32i16() {
 define void @ashr_v64i8() {
 ; SSE-LABEL: @ashr_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = ashr <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = ashr <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @ashr_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;
@@ -491,12 +491,12 @@ define void @ashr_v64i8() {
 ;
 ; XOP-LABEL: @ashr_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP3:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; XOP-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; XOP-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
index d86fd028d5648..ba373f8547af4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -23,31 +23,31 @@
 define void @lshr_v8i64() {
 ; SSE-LABEL: @lshr_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @lshr_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -60,12 +60,12 @@ define void @lshr_v8i64() {
 ;
 ; XOP-LABEL: @lshr_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; XOP-NEXT:    ret void
 ;
@@ -107,31 +107,31 @@ define void @lshr_v8i64() {
 define void @lshr_v16i32() {
 ; SSE-LABEL: @lshr_v16i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
-; SSE-NEXT:    [[TMP12:%.*]] = lshr <4 x i32> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
 ; SSE-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @lshr_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -144,12 +144,12 @@ define void @lshr_v16i32() {
 ;
 ; XOP-LABEL: @lshr_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; XOP-NEXT:    ret void
 ;
@@ -223,31 +223,31 @@ define void @lshr_v16i32() {
 define void @lshr_v32i16() {
 ; SSE-LABEL: @lshr_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = lshr <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = lshr <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @lshr_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -260,12 +260,12 @@ define void @lshr_v32i16() {
 ;
 ; XOP-LABEL: @lshr_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; XOP-NEXT:    ret void
 ;
@@ -403,31 +403,31 @@ define void @lshr_v32i16() {
 define void @lshr_v64i8() {
 ; SSE-LABEL: @lshr_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = lshr <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @lshr_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;
@@ -440,12 +440,12 @@ define void @lshr_v64i8() {
 ;
 ; XOP-LABEL: @lshr_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; XOP-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; XOP-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
index 8ebe430f1782d..4ca73963881f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -23,31 +23,31 @@
 define void @shl_v8i64() {
 ; SSE-LABEL: @shl_v8i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = shl <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SSE-NEXT:    [[TMP12:%.*]] = shl <2 x i64> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT:    [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
 ; SSE-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @shl_v8i64(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; AVX-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; AVX-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; AVX-NEXT:    ret void
 ;
@@ -60,12 +60,12 @@ define void @shl_v8i64() {
 ;
 ; XOP-LABEL: @shl_v8i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
-; XOP-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
 ; XOP-NEXT:    store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
 ; XOP-NEXT:    ret void
 ;
@@ -174,12 +174,12 @@ define void @shl_v16i32() {
 ;
 ; AVX-LABEL: @shl_v16i32(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; AVX-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT:    [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; AVX-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; AVX-NEXT:    ret void
 ;
@@ -192,12 +192,12 @@ define void @shl_v16i32() {
 ;
 ; XOP-LABEL: @shl_v16i32(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
-; XOP-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT:    [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
 ; XOP-NEXT:    store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
 ; XOP-NEXT:    ret void
 ;
@@ -271,31 +271,31 @@ define void @shl_v16i32() {
 define void @shl_v32i16() {
 ; SSE-LABEL: @shl_v32i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP3:%.*]] = shl <8 x i16> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP6:%.*]] = shl <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP9:%.*]] = shl <8 x i16> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
-; SSE-NEXT:    [[TMP12:%.*]] = shl <8 x i16> [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP9:%.*]] = shl <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = shl <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = shl <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = shl <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
 ; SSE-NEXT:    store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @shl_v32i16(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; AVX-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT:    [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; AVX-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; AVX-NEXT:    ret void
 ;
@@ -308,12 +308,12 @@ define void @shl_v32i16() {
 ;
 ; XOP-LABEL: @shl_v32i16(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
-; XOP-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT:    [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
 ; XOP-NEXT:    store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
 ; XOP-NEXT:    ret void
 ;
@@ -451,31 +451,31 @@ define void @shl_v32i16() {
 define void @shl_v64i8() {
 ; SSE-LABEL: @shl_v64i8(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = shl <16 x i8> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = shl <16 x i8> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = shl <16 x i8> [[TMP7]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = shl <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; SSE-NEXT:    [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
+; SSE-NEXT:    [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
+; SSE-NEXT:    [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
+; SSE-NEXT:    [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; SSE-NEXT:    store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @shl_v64i8(
 ; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; AVX-NEXT:    [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; AVX-NEXT:    [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]]
+; AVX-NEXT:    [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]]
+; AVX-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; AVX-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; AVX-NEXT:    ret void
 ;
@@ -488,12 +488,12 @@ define void @shl_v64i8() {
 ;
 ; XOP-LABEL: @shl_v64i8(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]]
-; XOP-NEXT:    store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
-; XOP-NEXT:    [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]]
+; XOP-NEXT:    [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1
+; XOP-NEXT:    [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]]
+; XOP-NEXT:    [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]]
+; XOP-NEXT:    store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1
 ; XOP-NEXT:    store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1
 ; XOP-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
index ceb9f4bcf99fc..86d728cb7c4bd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -6,19 +6,19 @@ define void @wombat(i32* %ptr, i32* %ptr1) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3
-; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4
-; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5
-; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3
 ; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], <i32 -1, i32 -1>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], poison
 ; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> poison, <4 x i32> [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> poison, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
index 9349d5d95be7e..72b1ff2609d64 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@@ -12,13 +12,13 @@ define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i6
 ; CHECK-NEXT:    [[I_019:%.*]] = phi i64 [ [[TMP10:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[I_019]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP6]], <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 7, i32 14, i32 21, i32 28>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], <i32 7, i32 14, i32 21, i32 28>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP10]] = add i64 [[I_019]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[TMP10]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
index 120f80547ef57..2a18d50b38933 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -33,12 +33,12 @@ define void @test1(double* %a, double* %b, double* %c) {
 ; Simple 3-pair chain with loads and stores, obfuscated with bitcasts
 define void @test2(double* %a, double* %b, i8* %e) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[C]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
@@ -62,11 +62,11 @@ define void @test2(double* %a, double* %b, i8* %e) {
 ; Don't vectorize volatile loads.
 define void @test_volatile_load(double* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test_volatile_load(
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A]], align 8
-; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B]], align 8
+; CHECK-NEXT:    [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
+; CHECK-NEXT:    [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
 ; CHECK-NEXT:    [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
 ; CHECK-NEXT:    [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
index 72eb65e45fb7c..0188dd74f7a00 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -181,10 +181,10 @@ define void @sitofp_8i64_8f64() #0 {
 ;
 ; AVX256DQ-LABEL: @sitofp_8i64_8f64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX256DQ-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256DQ-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -234,10 +234,10 @@ define void @sitofp_2i32_2f64() #0 {
 define void @sitofp_4i32_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i32_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -265,25 +265,25 @@ define void @sitofp_4i32_4f64() #0 {
 define void @sitofp_8i32_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i32_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i32_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -339,10 +339,10 @@ define void @sitofp_2i16_2f64() #0 {
 define void @sitofp_4i16_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i16_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -370,25 +370,25 @@ define void @sitofp_4i16_4f64() #0 {
 define void @sitofp_8i16_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i16> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i16_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -444,10 +444,10 @@ define void @sitofp_2i8_2f64() #0 {
 define void @sitofp_4i8_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i8_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -475,25 +475,25 @@ define void @sitofp_4i8_4f64() #0 {
 define void @sitofp_8i8_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i8_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i8> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i8_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -578,10 +578,10 @@ define void @sitofp_4i64_4f32() #0 {
 define void @sitofp_8i64_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i64_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -643,10 +643,10 @@ define void @sitofp_4i32_4f32() #0 {
 define void @sitofp_8i32_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i32_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -686,25 +686,25 @@ define void @sitofp_8i32_8f32() #0 {
 define void @sitofp_16i32_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i32_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i32_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -790,10 +790,10 @@ define void @sitofp_4i16_4f32() #0 {
 define void @sitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -833,25 +833,25 @@ define void @sitofp_8i16_8f32() #0 {
 define void @sitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i16_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i16_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -937,10 +937,10 @@ define void @sitofp_4i8_4f32() #0 {
 define void @sitofp_8i8_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i8_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -980,25 +980,25 @@ define void @sitofp_8i8_8f32() #0 {
 define void @sitofp_16i8_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i8_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i8_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 9f31fc2f3dc19..00e077b663b94 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -181,10 +181,10 @@ define void @sitofp_8i64_8f64() #0 {
 ;
 ; AVX256DQ-LABEL: @sitofp_8i64_8f64(
 ; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX256DQ-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256DQ-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256DQ-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256DQ-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256DQ-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256DQ-NEXT:    ret void
 ;
@@ -234,10 +234,10 @@ define void @sitofp_2i32_2f64() #0 {
 define void @sitofp_4i32_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i32_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -265,25 +265,25 @@ define void @sitofp_4i32_4f64() #0 {
 define void @sitofp_8i32_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i32_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i32_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -339,10 +339,10 @@ define void @sitofp_2i16_2f64() #0 {
 define void @sitofp_4i16_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i16_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -370,25 +370,25 @@ define void @sitofp_4i16_4f64() #0 {
 define void @sitofp_8i16_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i16> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i16_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -444,10 +444,10 @@ define void @sitofp_2i8_2f64() #0 {
 define void @sitofp_4i8_4f64() #0 {
 ; SSE-LABEL: @sitofp_4i8_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -475,25 +475,25 @@ define void @sitofp_4i8_4f64() #0 {
 define void @sitofp_8i8_8f64() #0 {
 ; SSE-LABEL: @sitofp_8i8_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i8> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_8i8_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -578,10 +578,10 @@ define void @sitofp_4i64_4f32() #0 {
 define void @sitofp_8i64_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i64_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -643,10 +643,10 @@ define void @sitofp_4i32_4f32() #0 {
 define void @sitofp_8i32_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i32_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -686,25 +686,25 @@ define void @sitofp_8i32_8f32() #0 {
 define void @sitofp_16i32_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i32_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i32_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -790,10 +790,10 @@ define void @sitofp_4i16_4f32() #0 {
 define void @sitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i16_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -833,25 +833,25 @@ define void @sitofp_8i16_8f32() #0 {
 define void @sitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i16_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i16_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -937,10 +937,10 @@ define void @sitofp_4i8_4f32() #0 {
 define void @sitofp_8i8_8f32() #0 {
 ; SSE-LABEL: @sitofp_8i8_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -980,25 +980,25 @@ define void @sitofp_8i8_8f32() #0 {
 define void @sitofp_16i8_16f32() #0 {
 ; SSE-LABEL: @sitofp_16i8_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sitofp_16i8_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
index bd1d24e1b45c8..148dbabdee715 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -7,28 +7,27 @@ define dso_local void @_Z4testP1S(%struct.S* %p) local_unnamed_addr {
 ; CHECK-LABEL: @_Z4testP1S(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 1, i64 0
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 1
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 2
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 3
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
 ; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 4
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12
 ; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 5
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13
 ; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
 ; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 6
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14
 ; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 7
-; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15
-; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4
-; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 5
@@ -42,6 +41,7 @@ define dso_local void @_Z4testP1S(%struct.S* %p) local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
 ; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret void
@@ -128,15 +128,15 @@ define dso_local void @test_unordered_splits(%struct.S* nocapture %p) local_unna
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
 ; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
 ; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -199,21 +199,21 @@ define dso_local void @test_cost_splits(%struct.S* nocapture %p) local_unnamed_a
 ; CHECK-NEXT:    [[G22:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 14
 ; CHECK-NEXT:    [[G23:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 15
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
-; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
-; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
-; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
-; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
index b25275ee1c879..c8baf37637b4a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
@@ -37,10 +37,10 @@ define void @sqrt_2f64() #0 {
 define void @sqrt_4f64() #0 {
 ; SSE-LABEL: @sqrt_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
 ; SSE-NEXT:    ret void
 ;
@@ -68,25 +68,25 @@ define void @sqrt_4f64() #0 {
 define void @sqrt_8f64() #0 {
 ; SSE-LABEL: @sqrt_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]])
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]])
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sqrt_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP3]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]])
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
 ; AVX256-NEXT:    ret void
 ;
@@ -148,10 +148,10 @@ define void @sqrt_4f32() #0 {
 define void @sqrt_8f32() #0 {
 ; SSE-LABEL: @sqrt_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
@@ -191,25 +191,25 @@ define void @sqrt_8f32() #0 {
 define void @sqrt_16f32() #0 {
 ; SSE-LABEL: @sqrt_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP5]])
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP7]])
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4]])
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @sqrt_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP3]])
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+; AVX256-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]])
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
index a5ae2a02a35f3..19f654e5a4f87 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -9,19 +9,19 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP6]], align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
index 6a5e698371f04..f773910d8c614 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
@@ -4,26 +4,26 @@
 define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @non-ordered-stores(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
+; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
index b24da6cf95d4f..65d1fce9e1303 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -90,13 +90,13 @@ define void @store_reverse(i64* %p3) {
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 10
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP5]], align 8
@@ -148,6 +148,9 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
@@ -164,9 +167,6 @@ define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
@@ -230,6 +230,9 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
@@ -247,9 +250,6 @@ define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    store i64 5, i64* [[ARRAYIDX9]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
index e04d9297ac8fb..7232b20eafc49 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -11,9 +11,9 @@ define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double
 ; CHECK-NEXT:    [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[SRC_ADDR_013]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]]
@@ -62,9 +62,9 @@ define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float*
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC_ADDR_021]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
@@ -165,16 +165,16 @@ define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, fl
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
 ; CHECK-NEXT:    [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
index 69544d9e59827..b38bd8d6f88e5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -39,10 +39,10 @@ define void @uitofp_2i64_2f64() #0 {
 define void @uitofp_4i64_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i64_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -70,25 +70,25 @@ define void @uitofp_4i64_4f64() #0 {
 define void @uitofp_8i64_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i64_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_8i64_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -171,10 +171,10 @@ define void @uitofp_2i32_2f64() #0 {
 define void @uitofp_4i32_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i32_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -202,25 +202,25 @@ define void @uitofp_4i32_4f64() #0 {
 define void @uitofp_8i32_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i32_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i32> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i32> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_8i32_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -276,10 +276,10 @@ define void @uitofp_2i16_2f64() #0 {
 define void @uitofp_4i16_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i16_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -307,25 +307,25 @@ define void @uitofp_4i16_4f64() #0 {
 define void @uitofp_8i16_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i16_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i16> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i16> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_8i16_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -381,10 +381,10 @@ define void @uitofp_2i8_2f64() #0 {
 define void @uitofp_4i8_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i8_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
 ; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -412,25 +412,25 @@ define void @uitofp_4i8_4f64() #0 {
 define void @uitofp_8i8_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i8_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i8> [[TMP5]] to <2 x double>
-; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i8> [[TMP7]] to <2 x double>
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
 ; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_8i8_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
-; AVX256-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x double>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
 ; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -515,10 +515,10 @@ define void @uitofp_4i64_4f32() #0 {
 define void @uitofp_8i64_8f32() #0 {
 ; SSE-LABEL: @uitofp_8i64_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -580,10 +580,10 @@ define void @uitofp_4i32_4f32() #0 {
 define void @uitofp_8i32_8f32() #0 {
 ; SSE-LABEL: @uitofp_8i32_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -623,25 +623,25 @@ define void @uitofp_8i32_8f32() #0 {
 define void @uitofp_16i32_16f32() #0 {
 ; SSE-LABEL: @uitofp_16i32_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i32> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_16i32_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i32> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -727,10 +727,10 @@ define void @uitofp_4i16_4f32() #0 {
 define void @uitofp_8i16_8f32() #0 {
 ; SSE-LABEL: @uitofp_8i16_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -770,25 +770,25 @@ define void @uitofp_8i16_8f32() #0 {
 define void @uitofp_16i16_16f32() #0 {
 ; SSE-LABEL: @uitofp_16i16_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_16i16_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i16> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;
@@ -874,10 +874,10 @@ define void @uitofp_4i8_4f32() #0 {
 define void @uitofp_8i8_8f32() #0 {
 ; SSE-LABEL: @uitofp_8i8_8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
 ; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
@@ -917,25 +917,25 @@ define void @uitofp_8i8_8f32() #0 {
 define void @uitofp_16i8_16f32() #0 {
 ; SSE-LABEL: @uitofp_16i8_16f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
-; SSE-NEXT:    [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
-; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
-; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i8> [[TMP5]] to <4 x float>
-; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
-; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i8> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT:    store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT:    store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT:    store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
 ; SSE-NEXT:    store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_16i8_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
-; AVX256-NEXT:    [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
-; AVX256-NEXT:    store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
-; AVX256-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
-; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i8> [[TMP3]] to <8 x float>
+; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT:    store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
 ; AVX256-NEXT:    store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
 ; AVX256-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll
index 08840ff36c0a2..95f7430fe7b21 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll
@@ -8,19 +8,19 @@ define void @foo(i8* %c, float* %d) {
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 3
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1
-; CHECK-NEXT:    [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2
-; CHECK-NEXT:    [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3
-; CHECK-NEXT:    [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX4]] to <4 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 1, i32 2, i32 7, i32 0>
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1
+; CHECK-NEXT:    [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2
+; CHECK-NEXT:    [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], poison
+; CHECK-NEXT:    [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ADD_PTR53]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
index d1846d4f4818f..2d87f4fa5bea5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
@@ -6,10 +6,10 @@ define void @test(double* %isec) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[ISEC:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 0
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 3
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
index 8a2e04339843b..87709a87b3692 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -17,18 +17,18 @@ define void @foo() {
 ; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB4:%.*]]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float>
 ; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]]
 ; CHECK-NEXT:    br label [[BB3]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll
index 779e8dee295fd..b54d67fb82011 100644
--- a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll
@@ -12,9 +12,9 @@ define void @test_sideeffect(float* %p) {
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, float* [[P]], i64 3
 ; CHECK-NEXT:    call void @llvm.sideeffect()
-; CHECK-NEXT:    call void @llvm.sideeffect()
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@@ -45,9 +45,9 @@ define void @test_inaccessiblememonly(float* %p) {
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr float, float* [[P]], i64 3
 ; CHECK-NEXT:    call void @foo() #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    call void @foo() #[[ATTR1]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    call void @foo() #[[ATTR1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void