[llvm] 1eeb2bf - [SLP]Do not schedule instructions with constants/argument/phi operands and external users.

Wed Mar 16 06:07:27 PDT 2022

Author: Alexey Bataev
Date: 2022-03-16T06:05:43-07:00
New Revision: 1eeb2bfe727323332800e8d390f2f8c63c953779

URL: https://github.com/llvm/llvm-project/commit/1eeb2bfe727323332800e8d390f2f8c63c953779
DIFF: https://github.com/llvm/llvm-project/commit/1eeb2bfe727323332800e8d390f2f8c63c953779.diff

LOG: [SLP]Do not schedule instructions with constants/argument/phi operands and external users.

No need to schedule entry nodes where all instructions are not memory
read/write instructions and their operands are either constants, or
arguments, or phis, or instructions from others blocks, or their users
are phis or from the other blocks.
The resulting vector instructions can be placed at
the beginning of the basic block without scheduling (if operands does
not need to be scheduled) or at the end of the block (if users are
outside of the block).
It may save some compile time and scheduling resources.

Differential Revision: https://reviews.llvm.org/D121121

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
    llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
    llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
    llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
    llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
    llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
    llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
    llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
    llvm/test/Transforms/SLPVectorizer/X86/phi.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
    llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
    llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
    llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
    llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 90114f934fcaa..80b6ff7d54d8f 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -776,6 +776,57 @@ static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
       Scalars[Mask[I]] = Prev[I];
 }
 
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all operands are either not instructions
+/// or phi nodes or instructions from 
diff erent blocks.
+static bool areAllOperandsNonInsts(Value *V) {
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return true;
+  return !I->mayReadOrWriteMemory() && all_of(I->operands(), [I](Value *V) {
+    auto *IO = dyn_cast<Instruction>(V);
+    if (!IO)
+      return true;
+    return isa<PHINode>(IO) || IO->getParent() != I->getParent();
+  });
+}
+
+/// Checks if the provided value does not require scheduling. It does not
+/// require scheduling if this is not an instruction or it is an instruction
+/// that does not read/write memory and all users are phi nodes or instructions
+/// from the 
diff erent blocks.
+static bool isUsedOutsideBlock(Value *V) {
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return true;
+  // Limits the number of uses to save compile time.
+  constexpr int UsesLimit = 8;
+  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
+         all_of(I->users(), [I](User *U) {
+           auto *IU = dyn_cast<Instruction>(U);
+           if (!IU)
+             return true;
+           return IU->getParent() != I->getParent() || isa<PHINode>(IU);
+         });
+}
+
+/// Checks if the specified value does not require scheduling. It does not
+/// require scheduling if all operands and all users do not need to be scheduled
+/// in the current basic block.
+static bool doesNotNeedToBeScheduled(Value *V) {
+  return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
+}
+
+/// Checks if the specified array of instructions does not require scheduling.
+/// It is so if all either instructions have operands that do not require
+/// scheduling or their users do not require scheduling since they are phis or
+/// in other basic blocks.
+static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
+  return !VL.empty() &&
+         (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -2359,15 +2410,21 @@ class BoUpSLP {
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
-      unsigned Lane = 0;
-      for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
-           BundleMember = BundleMember->NextInBundle) {
-        BundleMember->TE = Last;
-        BundleMember->Lane = Lane;
-        ++Lane;
-      }
-      assert((!Bundle.getValue() || Lane == VL.size()) &&
+      ScheduleData *BundleMember = Bundle.getValue();
+      assert((BundleMember || isa<PHINode>(S.MainOp) ||
+              isVectorLikeInstWithConstOps(S.MainOp) ||
+              doesNotNeedToSchedule(VL)) &&
              "Bundle and VL out of sync");
+      if (BundleMember) {
+        for (Value *V : VL) {
+          if (doesNotNeedToBeScheduled(V))
+            continue;
+          assert(BundleMember && "Unexpected end of bundle.");
+          BundleMember->TE = Last;
+          BundleMember = BundleMember->NextInBundle;
+        }
+      }
+      assert(!BundleMember && "Bundle and VL out of sync");
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -2504,7 +2561,6 @@ class BoUpSLP {
       clearDependencies();
       OpValue = OpVal;
       TE = nullptr;
-      Lane = -1;
     }
 
     /// Verify basic self consistency properties
@@ -2544,7 +2600,7 @@ class BoUpSLP {
     /// Returns true if it represents an instruction bundle and not only a
     /// single instruction.
     bool isPartOfBundle() const {
-      return NextInBundle != nullptr || FirstInBundle != this;
+      return NextInBundle != nullptr || FirstInBundle != this || TE;
     }
 
     /// Returns true if it is ready for scheduling, i.e. it has no more
@@ -2649,9 +2705,6 @@ class BoUpSLP {
     /// Note that this is negative as long as Dependencies is not calculated.
     int UnscheduledDeps = InvalidDeps;
 
-    /// The lane of this node in the TreeEntry.
-    int Lane = -1;
-
     /// True if this instruction is scheduled (or considered as scheduled in the
     /// dry-run).
     bool IsScheduled = false;
@@ -2669,6 +2722,21 @@ class BoUpSLP {
   friend struct DOTGraphTraits<BoUpSLP *>;
 
   /// Contains all scheduling data for a basic block.
+  /// It does not schedules instructions, which are not memory read/write
+  /// instructions and their operands are either constants, or arguments, or
+  /// phis, or instructions from others blocks, or their users are phis or from
+  /// the other blocks. The resulting vector instructions can be placed at the
+  /// beginning of the basic block without scheduling (if operands does not need
+  /// to be scheduled) or at the end of the block (if users are outside of the
+  /// block). It allows to save some compile time and memory used by the
+  /// compiler.
+  /// ScheduleData is assigned for each instruction in between the boundaries of
+  /// the tree entry, even for those, which are not part of the graph. It is
+  /// required to correctly follow the dependencies between the instructions and
+  /// their correct scheduling. The ScheduleData is not allocated for the
+  /// instructions, which do not require scheduling, like phis, nodes with
+  /// extractelements/insertelements only or nodes with instructions, with
+  /// uses/operands outside of the block.
   struct BlockScheduling {
     BlockScheduling(BasicBlock *BB)
         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
@@ -2696,7 +2764,7 @@ class BoUpSLP {
       if (BB != I->getParent())
         // Avoid lookup if can't possibly be in map.
         return nullptr;
-      ScheduleData *SD = ScheduleDataMap[I];
+      ScheduleData *SD = ScheduleDataMap.lookup(I);
       if (SD && isInSchedulingRegion(SD))
         return SD;
       return nullptr;
@@ -2713,7 +2781,7 @@ class BoUpSLP {
         return getScheduleData(V);
       auto I = ExtraScheduleDataMap.find(V);
       if (I != ExtraScheduleDataMap.end()) {
-        ScheduleData *SD = I->second[Key];
+        ScheduleData *SD = I->second.lookup(Key);
         if (SD && isInSchedulingRegion(SD))
           return SD;
       }
@@ -2735,7 +2803,7 @@ class BoUpSLP {
            BundleMember = BundleMember->NextInBundle) {
         if (BundleMember->Inst != BundleMember->OpValue)
           continue;
-        
+
         // Handle the def-use chain dependencies.
 
         // Decrement the unscheduled counter and insert to ready list if ready.
@@ -2760,7 +2828,9 @@ class BoUpSLP {
         // reordered during buildTree(). We therefore need to get its operands
         // through the TreeEntry.
         if (TreeEntry *TE = BundleMember->TE) {
-          int Lane = BundleMember->Lane;
+          // Need to search for the lane since the tree entry can be reordered.
+          int Lane = std::distance(TE->Scalars.begin(),
+                                   find(TE->Scalars, BundleMember->Inst));
           assert(Lane >= 0 && "Lane not set");
 
           // Since vectorization tree is being built recursively this assertion
@@ -2769,7 +2839,7 @@ class BoUpSLP {
           // where their second (immediate) operand is not added. Since
           // immediates do not affect scheduler behavior this is considered
           // okay.
-          auto *In = TE->getMainOp();
+          auto *In = BundleMember->Inst;
           assert(In &&
                  (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
                   In->getNumOperands() == TE->getNumOperands()) &&
@@ -2814,7 +2884,8 @@ class BoUpSLP {
 
       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
         auto *SD = getScheduleData(I);
-        assert(SD && "primary scheduledata must exist in window");
+        if (!SD)
+          continue;
         assert(isInSchedulingRegion(SD) &&
                "primary schedule data not in window?");
         assert(isInSchedulingRegion(SD->FirstInBundle) &&
@@ -3856,6 +3927,22 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   return LoadsState::Gather;
 }
 
+/// \return true if the specified list of values has only one instruction that
+/// requires scheduling, false otherwise.
+static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
+  Value *NeedsScheduling = nullptr;
+  for (Value *V : VL) {
+    if (doesNotNeedToBeScheduled(V))
+      continue;
+    if (!NeedsScheduling) {
+      NeedsScheduling = V;
+      continue;
+    }
+    return false;
+  }
+  return NeedsScheduling;
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -6398,6 +6485,20 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
     return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
   }));
 
+  // Set the insert point to the beginning of the basic block if the entry
+  // should not be scheduled.
+  if (E->State != TreeEntry::NeedToGather &&
+      doesNotNeedToSchedule(E->Scalars)) {
+    BasicBlock::iterator InsertPt;
+    if (all_of(E->Scalars, isUsedOutsideBlock))
+      InsertPt = BB->getTerminator()->getIterator();
+    else
+      InsertPt = BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(BB, InsertPt);
+    Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+    return;
+  }
+
   // The last instruction in the bundle in program order.
   Instruction *LastInst = nullptr;
 
@@ -6406,8 +6507,10 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
   if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
-        BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+    Value *V = E->isOneOf(E->Scalars.back());
+    if (doesNotNeedToBeScheduled(V))
+      V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
+    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
     if (Bundle && Bundle->isPartOfBundle())
       for (; Bundle; Bundle = Bundle->NextInBundle)
         if (Bundle->OpValue == Bundle->Inst)
@@ -7633,9 +7736,11 @@ void BoUpSLP::optimizeGatherSequence() {
 
 BoUpSLP::ScheduleData *
 BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
-  ScheduleData *Bundle = nullptr;  
+  ScheduleData *Bundle = nullptr;
   ScheduleData *PrevInBundle = nullptr;
   for (Value *V : VL) {
+    if (doesNotNeedToBeScheduled(V))
+      continue;
     ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember &&
            "no ScheduleData for bundle member "
@@ -7663,7 +7768,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S) {
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
-  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue))
+  if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
+      doesNotNeedToSchedule(VL))
     return nullptr;
 
   // Initialize the instruction bundle.
@@ -7709,6 +7815,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
+    if (doesNotNeedToBeScheduled(V))
+      continue;
     if (!extendSchedulingRegion(V, S)) {
       // If the scheduling region got new instructions at the lower end (or it
       // is a new region for the first bundle). This makes it necessary to
@@ -7723,6 +7831,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 
   bool ReSchedule = false;
   for (Value *V : VL) {
+    if (doesNotNeedToBeScheduled(V))
+      continue;
     ScheduleData *BundleMember = getScheduleData(V);
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
@@ -7752,14 +7862,18 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 
 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                 Value *OpValue) {
-  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue))
+  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
+      doesNotNeedToSchedule(VL))
     return;
 
+  if (doesNotNeedToBeScheduled(OpValue))
+    OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
   ScheduleData *Bundle = getScheduleData(OpValue);
   LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
-  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+  assert(Bundle->isSchedulingEntity() &&
+         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
          "tried to unbundle something which is not a bundle");
 
   // Remove the bundle from the ready list.
@@ -7773,6 +7887,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
     BundleMember->FirstInBundle = BundleMember;
     ScheduleData *Next = BundleMember->NextInBundle;
     BundleMember->NextInBundle = nullptr;
+    BundleMember->TE = nullptr;
     if (BundleMember->unscheduledDepsInBundle() == 0) {
       ReadyInsts.insert(BundleMember);
     }
@@ -7796,6 +7911,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
   Instruction *I = dyn_cast<Instruction>(V);
   assert(I && "bundle member must be an instruction");
   assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
+         !doesNotNeedToBeScheduled(I) &&
          "phi nodes/insertelements/extractelements/extractvalues don't need to "
          "be scheduled");
   auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
@@ -7872,7 +7988,10 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                 ScheduleData *NextLoadStore) {
   ScheduleData *CurrentLoadStore = PrevLoadStore;
   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
-    ScheduleData *SD = ScheduleDataMap[I];
+    // No need to allocate data for non-schedulable instructions.
+    if (doesNotNeedToBeScheduled(I))
+      continue;
+    ScheduleData *SD = ScheduleDataMap.lookup(I);
     if (!SD) {
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
@@ -8056,8 +8175,10 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+      TreeEntry *SDTE = getTreeEntry(SD->Inst);
       assert((isVectorLikeInstWithConstOps(SD->Inst) ||
-              SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
+              SD->isPartOfBundle() ==
+                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
       if (SD->isSchedulingEntity()) {

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index 536f72a73684e..ec7b03af83f8b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,6 +36,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -85,7 +86,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -111,6 +111,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -160,7 +161,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -297,6 +297,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -346,7 +347,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
@@ -372,6 +372,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
 ; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
 ; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
@@ -421,7 +422,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
 ; KRYO-NEXT:    [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
 ; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index e9c502b6982cd..01d743fcbfe97 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -35,41 +35,14 @@ define void @PR28330(i32 %n) {
 ;
 ; MAX-COST-LABEL: @PR28330(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
-; MAX-COST-NEXT:    [[P1:%.*]] = icmp eq i8 [[P0]], 0
-; MAX-COST-NEXT:    [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
-; MAX-COST-NEXT:    [[P3:%.*]] = icmp eq i8 [[P2]], 0
-; MAX-COST-NEXT:    [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[P5:%.*]] = icmp eq i8 [[P4]], 0
-; MAX-COST-NEXT:    [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[P7:%.*]] = icmp eq i8 [[P6]], 0
-; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P20:%.*]] = add i32 [[P17]], [[P19]]
-; MAX-COST-NEXT:    [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P22:%.*]] = add i32 [[P20]], [[P21]]
-; MAX-COST-NEXT:    [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P24:%.*]] = add i32 [[P22]], [[P23]]
-; MAX-COST-NEXT:    [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P26:%.*]] = add i32 [[P24]], [[P25]]
-; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P28:%.*]] = add i32 [[P26]], [[P27]]
-; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P30:%.*]] = add i32 [[P28]], [[P29]]
-; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[P30]], [[P31]]
-; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
@@ -139,30 +112,14 @@ define void @PR32038(i32 %n) {
 ;
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT:    [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
-; MAX-COST-NEXT:    [[P9:%.*]] = icmp eq i8 [[P8]], 0
-; MAX-COST-NEXT:    [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
-; MAX-COST-NEXT:    [[P11:%.*]] = icmp eq i8 [[P10]], 0
-; MAX-COST-NEXT:    [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
-; MAX-COST-NEXT:    [[P13:%.*]] = icmp eq i8 [[P12]], 0
-; MAX-COST-NEXT:    [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
-; MAX-COST-NEXT:    [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
-; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
-; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; MAX-COST-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]]
-; MAX-COST-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]]
-; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5
-; MAX-COST-NEXT:    [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
-; MAX-COST-NEXT:    [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; MAX-COST-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], -5
 ; MAX-COST-NEXT:    br label [[FOR_BODY]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
index 39f2f885bc26b..c1451090d23c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
@@ -14,14 +14,14 @@ define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 {
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata %struct.S* [[P:%.*]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25:![0-9]+]]
 ; CHECK-NEXT:    [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]]
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]]
-; CHECK-NEXT:    [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28:![0-9]+]]
+; CHECK-NEXT:    [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG29:![0-9]+]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]]
 ; CHECK-NEXT:    [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]]
 ; CHECK-NEXT:    [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]]
-; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]]
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA30]]
 ; CHECK-NEXT:    ret void, !dbg [[DBG37:![0-9]+]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
index f6ab38bb3935e..fbd8b88f7d435 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll
@@ -11,14 +11,14 @@ define dso_local void @l() local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef
+; CHECK-NEXT:    [[I4:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], <i16 8, i16 8>
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32>
 ; CHECK-NEXT:    br label [[BB25]]
 ; CHECK:       bb11:
-; CHECK-NEXT:    [[I12:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef
+; CHECK-NEXT:    [[I12:%.*]] = zext i1 undef to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <2 x i64> undef, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
index 7f51dcae484ca..d15494e092c25 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -9,11 +9,11 @@ define void @test() #0 {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[DUMMY_ADD:%.*]] = add i16 0, 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], <i64 3, i64 2, i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3
+; CHECK-NEXT:    [[DUMMY_ADD:%.*]] = add i16 0, 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 7ab610f994264..f878bda14ad84 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -10,10 +10,10 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
-; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
+; CHECK-NEXT:    store atomic i32 [[TMP4]], i32* [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[V44:%.*]] = add i32 [[TMP2]], 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index c0f453f296641..f323c94899320 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -7,12 +7,12 @@ target triple = "x86_64-apple-macosx10.8.0"
 define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
index 8f57fe6866bd4..935904218cf3d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -511,9 +511,9 @@ define double @bar(double* nocapture readonly %a, i32 %n) local_unnamed_addr #0
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
+; CHECK-NEXT:    [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
index 7b914b43180ef..f6219f626bd03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -80,8 +80,8 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
 ; AVX-NEXT:    [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
 ; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
+; AVX-NEXT:    [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP17]]
 ; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; AVX:       for.end:
 ; AVX-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
index de371d8895c7d..0fd6217942c1d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll
@@ -8,16 +8,17 @@ define void @exceed(double %0, double %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP8]], undef
 ; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP8]], undef
 ; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
@@ -27,15 +28,14 @@ define void @exceed(double %0, double %1) {
 ; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
 ; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP9]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef
 ; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[BB2:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
index 407cfbc2090e0..6c0116b174fcd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
@@ -22,8 +22,8 @@ define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 5.000000e+00, float 8.000000e+00>
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 5.000000e+00, float 8.000000e+00>
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
index 36aec7fcbf48a..066f0ff3cf406 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@@ -24,9 +24,9 @@ define i32 @foo(i32* nocapture %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> [[TMP3]], <i32 18, i32 19, i32 12, i32 9>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_029]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> [[TMP3]], <i32 18, i32 19, i32 12, i32 9>
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi <4 x i32> [ [[TMP1]], [[ENTRY]] ], [ [[TMP4]], [[FOR_BODY]] ]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
index 012ba4da88033..df064c553e3f1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
@@ -34,9 +34,9 @@ define double @ext_user(double* noalias nocapture %B, double* noalias nocapture
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_020]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
index 596543880d365..f5ccb59693517 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
@@ -25,22 +25,22 @@ define dso_local i32 @g() local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1
-; CHECK-NEXT:    store i32 [[TMP7]], i32* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1
+; CHECK-NEXT:    store i32 [[TMP7]], i32* [[TMP8]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       sw.bb6:
 ; CHECK-NEXT:    [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
-; CHECK-NEXT:    store i32 [[TMP11]], i32* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0
+; CHECK-NEXT:    store i32 [[TMP11]], i32* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> <i64 2, i64 2>
 ; CHECK-NEXT:    br label [[WHILE_BODY_BACKEDGE]]
 ; CHECK:       while.body.backedge:
 ; CHECK-NEXT:    [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
-; CHECK-NEXT:    [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ]
+; CHECK-NEXT:    [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP13]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ]
 ; CHECK-NEXT:    br label [[WHILE_BODY]]
 ; CHECK:       while.end:
 ; CHECK-NEXT:    ret i32 undef

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
index 1b224cb4109c1..2d05316ea48de 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -21,14 +21,14 @@ define i32 @bar(double* nocapture %A, i32 %d) {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[D:%.*]], 0
-; CHECK-NEXT:    br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[D:%.*]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP5:%.*]]
 ; CHECK:       5:
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call i32 (...) @foo()
 ; CHECK-NEXT:    br label [[TMP7]]
 ; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP4]], <float 4.000000e+00, float 5.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 9.000000e+00, double 5.000000e+00>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
index 80cb197982d48..8dc4a8936b722 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
@@ -58,10 +58,10 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {
 
 define void @test2(i64* %a, i64* %b) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index df767259b4f47..e3e4fdf1ff31e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -83,9 +83,9 @@ define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_019]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
@@ -151,7 +151,7 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00
@@ -169,18 +169,18 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-; CHECK-NEXT:    [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
+; CHECK-NEXT:    [[TMP20]] = fadd <4 x float> [[TMP6]], [[TMP18]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP20]], i32 0
 ; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP20]], i32 1
 ; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP20]], i32 2
 ; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP20]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
@@ -245,6 +245,8 @@ define float @sort_phi_type(float* nocapture readonly %A) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
@@ -254,8 +256,6 @@ define float @sort_phi_type(float* nocapture readonly %A) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3
 ; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> [[TMP8]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index f6dd7526e6e76..3ac598f650edd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -299,17 +299,17 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
 ; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
@@ -510,10 +510,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
 ; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
 ; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
@@ -749,47 +749,47 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
-; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> <i64 8, i64 5>
+; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512F-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
 ; AVX512F-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
 ; AVX512F-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
-; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> <i64 8, i64 5>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512VL-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
 ; AVX512VL-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
 ; AVX512VL-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index fd1c612a0696e..19fe257b32e6c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -299,17 +299,17 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_3(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512VL-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 3, i32 4, i32 1>
 ; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512VL-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; AVX512VL-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
 ; AVX512VL-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
@@ -510,10 +510,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_4(
-; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
 ; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
 ; AVX512VL-NEXT:    [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
 ; AVX512VL-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
 ; AVX512VL-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
 ; AVX512VL-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
@@ -749,47 +749,47 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
-; AVX512F-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
-; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512F-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512F-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512F-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> <i64 8, i64 5>
+; AVX512F-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512F-NEXT:    [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512F-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512F-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512F-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512F-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
 ; AVX512F-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
 ; AVX512F-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
-; AVX512VL-NEXT:    [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> <i64 8, i64 5>
-; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX512VL-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
-; AVX512VL-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
-; AVX512VL-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7
-; AVX512VL-NEXT:    [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> <i64 8, i64 5>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0
+; AVX512VL-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512VL-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512VL-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
+; AVX512VL-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
 ; AVX512VL-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
 ; AVX512VL-NEXT:    store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index 6e434fe97ace8..e2cc14a3b8773 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -6,12 +6,12 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i64 0
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F]], i64 0
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[F]], 1
+; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[VECINIT51]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
index 4301c3fd4b82a..7294b223243f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -33,30 +33,30 @@ define float @foo(float* nocapture readonly %A) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
 ; CHECK-NEXT:    [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], <float 7.000000e+00, float 8.000000e+00>
-; CHECK-NEXT:    [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[MUL13:%.*]] = fmul float [[TMP12]], 9.000000e+00
 ; CHECK-NEXT:    [[ADD14]] = fadd float [[B_032]], [[MUL13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121
+; CHECK-NEXT:    [[TMP14]] = fadd <2 x float> [[TMP5]], [[TMP10]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.for.body_crit_edge:
 ; CHECK-NEXT:    [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
 ; CHECK-NEXT:    [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]]
 ; CHECK-NEXT:    [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
 ; CHECK-NEXT:    ret float [[ADD17]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
index a4a388e9d095c..6946ab292cdf5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll
@@ -21,11 +21,11 @@ define void @foo(%class.e* %this, %struct.a* %p, i32 %add7) {
 ; CHECK-NEXT:    i32 2, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[G]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[G]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    br label [[SW_EPILOG]]
 ; CHECK:       sw.epilog:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
index 0188dd74f7a00..ade84eb2c6796 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -1065,14 +1065,14 @@ define void @sitofp_16i8_16f32() #0 {
 
 define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 ; SSE-LABEL: @sitofp_4xi32_4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1
 ; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; SSE-NEXT:    [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE-NEXT:    ret <4 x double> [[RES31]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 00e077b663b94..8c685847a1be0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1065,14 +1065,14 @@ define void @sitofp_16i8_16f32() #0 {
 
 define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 ; SSE-LABEL: @sitofp_4xi32_4f64(
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1
 ; SSE-NEXT:    [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double>
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1
+; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; SSE-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double>
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; SSE-NEXT:    [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SSE-NEXT:    ret <4 x double> [[RES31]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
index f773910d8c614..71fb5cd201109 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll
@@ -19,15 +19,15 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
 ; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
 ; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_7]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]]
 ; CHECK-NEXT:    br label [[BLOCK1:%.*]]
 ; CHECK:       block1:
@@ -42,9 +42,9 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
 ; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4
 ; CHECK-NEXT:    store i32 [[LOAD_9]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
index 87709a87b3692..109c27e4f4f4e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -16,8 +16,8 @@ define void @foo() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* undef, align 8
 ; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB4:%.*]]
 ; CHECK:       bb4:
-; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i16 undef to double
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i16 undef to double
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
index 33ba97921e878..cc2ee73f814c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll
@@ -133,27 +133,27 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256-NEXT:    br label [[BB1:%.*]]
 ; MAX256:       bb1:
 ; MAX256-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
-; MAX256-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
-; MAX256-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
-; MAX256-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
 ; MAX256-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
 ; MAX256-NEXT:    [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer
 ; MAX256-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
 ; MAX256-NEXT:    [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer
 ; MAX256-NEXT:    [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]]
-; MAX256-NEXT:    [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
-; MAX256-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
-; MAX256-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT:    [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
-; MAX256-NEXT:    [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]]
-; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
-; MAX256-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT:    [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
-; MAX256-NEXT:    [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
-; MAX256-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
-; MAX256-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX256-NEXT:    [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
-; MAX256-NEXT:    [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]]
+; MAX256-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX256-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT:    [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
+; MAX256-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX256-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT:    [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
+; MAX256-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX256-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX256-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX256-NEXT:    [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
+; MAX256-NEXT:    [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]]
+; MAX256-NEXT:    [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]]
+; MAX256-NEXT:    [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
+; MAX256-NEXT:    [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
 ; MAX256-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX256-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX256-NEXT:    i32 1, label [[BB3:%.*]]
@@ -166,10 +166,10 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX256:       bb5:
 ; MAX256-NEXT:    br label [[BB2]]
 ; MAX256:       bb2:
-; MAX256-NEXT:    [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ]
-; MAX256-NEXT:    [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ]
+; MAX256-NEXT:    [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
 ; MAX256-NEXT:    [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
 ; MAX256-NEXT:    store float [[TMP17]], float* undef, align 4
 ; MAX256-NEXT:    ret void
@@ -179,27 +179,27 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024-NEXT:    br label [[BB1:%.*]]
 ; MAX1024:       bb1:
 ; MAX1024-NEXT:    [[I:%.*]] = fpext half [[HVAL:%.*]] to float
-; MAX1024-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
-; MAX1024-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
-; MAX1024-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
 ; MAX1024-NEXT:    [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0
 ; MAX1024-NEXT:    [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer
 ; MAX1024-NEXT:    [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0
 ; MAX1024-NEXT:    [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer
 ; MAX1024-NEXT:    [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]]
-; MAX1024-NEXT:    [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
-; MAX1024-NEXT:    [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
-; MAX1024-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT:    [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
-; MAX1024-NEXT:    [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]]
-; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
-; MAX1024-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT:    [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
-; MAX1024-NEXT:    [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
-; MAX1024-NEXT:    [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
-; MAX1024-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer
-; MAX1024-NEXT:    [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
-; MAX1024-NEXT:    [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]]
+; MAX1024-NEXT:    [[I3:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0
+; MAX1024-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT:    [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]]
+; MAX1024-NEXT:    [[I6:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0
+; MAX1024-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT:    [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]]
+; MAX1024-NEXT:    [[I9:%.*]] = fpext half [[HVAL]] to float
+; MAX1024-NEXT:    [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0
+; MAX1024-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer
+; MAX1024-NEXT:    [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]]
+; MAX1024-NEXT:    [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]]
+; MAX1024-NEXT:    [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]]
+; MAX1024-NEXT:    [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]]
+; MAX1024-NEXT:    [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]]
 ; MAX1024-NEXT:    switch i32 undef, label [[BB5:%.*]] [
 ; MAX1024-NEXT:    i32 0, label [[BB2:%.*]]
 ; MAX1024-NEXT:    i32 1, label [[BB3:%.*]]
@@ -212,10 +212,10 @@ define void @phi_float32(half %hval, float %fval) {
 ; MAX1024:       bb5:
 ; MAX1024-NEXT:    br label [[BB2]]
 ; MAX1024:       bb2:
-; MAX1024-NEXT:    [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
-; MAX1024-NEXT:    [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ]
-; MAX1024-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ]
-; MAX1024-NEXT:    [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ]
+; MAX1024-NEXT:    [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ]
 ; MAX1024-NEXT:    [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7
 ; MAX1024-NEXT:    store float [[TMP17]], float* undef, align 4
 ; MAX1024-NEXT:    ret void