[llvm] 17ac10c - Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions"

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 21 05:37:50 PDT 2024


Author: David Green
Date: 2024-10-21T13:37:44+01:00
New Revision: 17ac10c28f0a3c078a82595787da7d855e581bf1

URL: https://github.com/llvm/llvm-project/commit/17ac10c28f0a3c078a82595787da7d855e581bf1
DIFF: https://github.com/llvm/llvm-project/commit/17ac10c28f0a3c078a82595787da7d855e581bf1.diff

LOG: Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions"

This reverts commit 7f2e937469a8cec3fe977bf41ad2dfb9b4ce648a as it causes
regressions in the tests it modifies, and undoes what was added in #100653
(which itself was a fix for a previous regression).
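
For readers following the diff, a small standalone C++ sketch (illustrative
only, not part of the commit; the helper names and example numbers are made
up) contrasts the two reduction-width sizing strategies involved: the
power-of-two rounding this revert restores versus the whole-register rounding
the reverted patch computed via getFloorFullVectorNumberOfElements, using the
arithmetic visible in the first hunk below.

#include <bit>
#include <cstdio>

// Power-of-two sizing, as in the restored reduction code: round the number
// of reduced values down to a power of two.
unsigned floorPowerOf2(unsigned Sz) { return std::bit_floor(Sz); }

// Whole-register sizing, mirroring getFloorFullVectorNumberOfElements from
// the reverted patch: derive a per-register VF and round Sz down to a
// multiple of it. NumParts stands in for TTI::getNumberOfParts on the
// widened type (an assumption for this sketch).
unsigned floorFullVectors(unsigned Sz, unsigned NumParts) {
  if (NumParts == 0 || NumParts >= Sz)
    return std::bit_floor(Sz);
  unsigned RegVF = std::bit_ceil((Sz + NumParts - 1) / NumParts); // divideCeil
  return (Sz / RegVF) * RegVF;
}

int main() {
  // Example: 12 scalars spread over 3 register parts. Whole-register sizing
  // keeps all 12 lanes (RegVF = 4, so 12 / 4 * 4 = 12); power-of-two sizing
  // drops to 8, leaving the rest for a later, smaller attempt.
  std::printf("whole-register: %u, power-of-two: %u\n",
              floorFullVectors(12, 3), floorPowerOf2(12));
  return 0;
}

The hunks around GetVectorFactor and CandidateVFs below show where the
reverted patch had wired the whole-register sizing into the reduction and
gathered-load paths.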

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
    llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
    llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
    llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
    llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e1aa6127ac037a..1098bf578d2d73 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
   if (NumParts == 0 || NumParts >= Sz)
     return bit_floor(Sz);
   unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
-  if (RegVF > Sz)
-    return bit_floor(Sz);
   return (Sz / RegVF) * RegVF;
 }
 
@@ -1507,12 +1505,6 @@ class BoUpSLP {
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
 
-  /// Checks if the graph and all its subgraphs cannot be better vectorized.
-  /// It may happen, if all gather nodes are loads and they cannot be
-  /// "clusterized". In this case even subgraphs cannot be vectorized more
-  /// effectively than the base graph.
-  bool isTreeNotExtendable() const;
-
   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
   /// can be load combined in the backend. Load combining may not be allowed in
   /// the IR optimizer, so we do not want to alter the pattern. For example,
@@ -3055,9 +3047,7 @@ class BoUpSLP {
   /// vector loads/masked gathers instead of regular gathers. Later these loads
   /// are reshufled to build final gathered nodes.
   void tryToVectorizeGatheredLoads(
-      const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
-                           SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
-                           8> &GatheredLoads);
+      ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads);
 
   /// Reorder commutative or alt operands to get better probability of
   /// generating vectorized code.
@@ -3069,7 +3059,7 @@ class BoUpSLP {
   /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
   /// users of \p TE and collects the stores. It returns the map from the store
   /// pointers to the collected stores.
-  SmallVector<SmallVector<StoreInst *>>
+  DenseMap<Value *, SmallVector<StoreInst *>>
   collectUserStores(const BoUpSLP::TreeEntry *TE) const;
 
   /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
@@ -4667,8 +4657,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                   const TargetLibraryInfo &TLI,
                                   bool CompareOpcodes = true) {
-  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
-      getUnderlyingObject(Ptr2, RecursionMaxDepth))
+  if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
     return false;
   auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
   auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
@@ -5188,40 +5177,30 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   return LoadsState::Gather;
 }
 
-static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
-                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
+static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                                    const DataLayout &DL, ScalarEvolution &SE,
                                    SmallVectorImpl<unsigned> &SortedIndices) {
-  assert(
-      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
-      "Expected list of pointer operands.");
+  assert(llvm::all_of(
+             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
+         "Expected list of pointer operands.");
   // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
   // Ptr into, sort and return the sorted indices with values next to one
   // another.
-  SmallMapVector<std::pair<BasicBlock *, Value *>,
-                 SmallVector<SmallVector<std::tuple<Value *, int, unsigned>>>, 8>
-      Bases;
-  Bases
-      .try_emplace(std::make_pair(
-          BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
-      .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
-
-  SortedIndices.clear();
-  for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
-    auto Key = std::make_pair(BBs[Cnt + 1],
-                              getUnderlyingObject(Ptr, RecursionMaxDepth));
-    bool Found = any_of(Bases.try_emplace(Key).first->second,
-                        [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
-                          std::optional<int> Diff = getPointersDiff(
-                              ElemTy, std::get<0>(Base.front()), ElemTy,
-                              Ptr, DL, SE,
-                              /*StrictCheck=*/true);
-                          if (!Diff)
-                            return false;
+  MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
+  Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
+
+  unsigned Cnt = 1;
+  for (Value *Ptr : VL.drop_front()) {
+    bool Found = any_of(Bases, [&](auto &Base) {
+      std::optional<int> Diff =
+          getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
+                          /*StrictCheck=*/true);
+      if (!Diff)
+        return false;
 
-                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
-                          return true;
-                        });
+      Base.second.emplace_back(Ptr, *Diff, Cnt++);
+      return true;
+    });
 
     if (!Found) {
       // If we haven't found enough to usefully cluster, return early.
@@ -5229,39 +5208,71 @@ static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
         return false;
 
       // Not found already - add a new Base
-      Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
+      Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
     }
   }
 
   // For each of the bases sort the pointers by Offset and check if any of the
   // base become consecutively allocated.
+  bool AnyConsecutive = false;
   for (auto &Base : Bases) {
-    for (auto &Vec : Base.second) {
-      if (Vec.size() > 1) {
-        stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
-                            const std::tuple<Value *, int, unsigned> &Y) {
-          return std::get<1>(X) < std::get<1>(Y);
-        });
-        int InitialOffset = std::get<1>(Vec[0]);
-        bool AnyConsecutive =
-            all_of(enumerate(Vec), [InitialOffset](const auto &P) {
-              return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
-            });
-        // Fill SortedIndices array only if it looks worth-while to sort the
-        // ptrs.
-        if (!AnyConsecutive)
-          return false;
-      }
+    auto &Vec = Base.second;
+    if (Vec.size() > 1) {
+      llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
+                                const std::tuple<Value *, int, unsigned> &Y) {
+        return std::get<1>(X) < std::get<1>(Y);
+      });
+      int InitialOffset = std::get<1>(Vec[0]);
+      AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
+        return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
+      });
     }
-    sort(Base.second, [](const auto &V1, const auto &V2) {
-      return std::get<2>(V1.front()) < std::get<2>(V2.front());
+  }
+
+  // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
+  SortedIndices.clear();
+  if (!AnyConsecutive)
+    return false;
+
+  // If we have a better order, also sort the base pointers by increasing
+  // (variable) values if possible, to try and keep the order more regular. In
+  // order to create a valid strict-weak order we cluster by the Root of gep
+  // chains and sort within each.
+  SmallVector<std::tuple<Value *, Value *, Value *>> SortedBases;
+  for (auto &Base : Bases) {
+    Value *Strip = Base.first->stripInBoundsConstantOffsets();
+    Value *Root = Strip;
+    while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
+      Root = Gep->getOperand(0);
+    SortedBases.emplace_back(Base.first, Strip, Root);
+  }
+  auto *Begin = SortedBases.begin();
+  auto *End = SortedBases.end();
+  while (Begin != End) {
+    Value *Root = std::get<2>(*Begin);
+    auto *Mid = std::stable_partition(
+        Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
+    DenseMap<Value *, DenseMap<Value *, bool>> LessThan;
+    for (auto *I = Begin; I < Mid; ++I)
+      LessThan.try_emplace(std::get<1>(*I));
+    for (auto *I = Begin; I < Mid; ++I) {
+      Value *V = std::get<1>(*I);
+      while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
+        V = Gep->getOperand(0);
+        if (LessThan.contains(V))
+          LessThan[V][std::get<1>(*I)] = true;
+      }
+    }
+    std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
+      return LessThan[std::get<1>(V1)][std::get<1>(V2)];
     });
+    Begin = Mid;
   }
 
-  for (auto &T : Bases)
-    for (const auto &Vec : T.second)
-      for (const auto &P : Vec)
-        SortedIndices.push_back(std::get<2>(P));
+  // Collect the final order of sorted indices
+  for (auto Base : SortedBases)
+    for (auto &T : Bases[std::get<0>(Base)])
+      SortedIndices.push_back(std::get<2>(T));
 
   assert(SortedIndices.size() == VL.size() &&
          "Expected SortedIndices to be the size of VL");
@@ -5275,18 +5286,15 @@ BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
 
   SmallVector<Value *> Ptrs;
   Ptrs.reserve(TE.Scalars.size());
-  SmallVector<BasicBlock *> BBs;
-  BBs.reserve(TE.Scalars.size());
   for (Value *V : TE.Scalars) {
     auto *L = dyn_cast<LoadInst>(V);
     if (!L || !L->isSimple())
       return std::nullopt;
     Ptrs.push_back(L->getPointerOperand());
-    BBs.push_back(L->getParent());
   }
 
   BoUpSLP::OrdersType Order;
-  if (clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
+  if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
     return std::move(Order);
   return std::nullopt;
 }
@@ -5654,7 +5662,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     }
     // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
     // has been auditted for correctness with non-power-of-two vectors.
-    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
+    if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
       if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
         return CurrentOrder;
   }
@@ -6385,15 +6393,13 @@ void BoUpSLP::buildExternalUses(
   }
 }
 
-SmallVector<SmallVector<StoreInst *>>
+DenseMap<Value *, SmallVector<StoreInst *>>
 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
-  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
-                SmallVector<StoreInst *>, 8>
-      PtrToStoresMap;
+  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
   for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
     Value *V = TE->Scalars[Lane];
     // Don't iterate over the users of constant data.
-    if (!isa<Instruction>(V))
+    if (isa<ConstantData>(V))
       continue;
     // To save compilation time we don't visit if we have too many users.
     if (V->hasNUsesOrMore(UsesLimit))
@@ -6411,34 +6417,25 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
       if (getTreeEntry(U))
         continue;
 
-      Value *Ptr =
-          getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
-      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
-                                        SI->getValueOperand()->getType(), Ptr}];
+      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
+      auto &StoresVec = PtrToStoresMap[Ptr];
       // For now just keep one store per pointer object per lane.
       // TODO: Extend this to support multiple stores per pointer per lane
       if (StoresVec.size() > Lane)
         continue;
-      if (!StoresVec.empty()) {
-        std::optional<int> Diff = getPointersDiff(
-            SI->getValueOperand()->getType(), SI->getPointerOperand(),
-            SI->getValueOperand()->getType(),
-            StoresVec.front()->getPointerOperand(), *DL, *SE,
-            /*StrictCheck=*/true);
-        // We failed to compare the pointers so just abandon this store.
-        if (!Diff)
-          continue;
-      }
+      // Skip if in different BBs.
+      if (!StoresVec.empty() &&
+          SI->getParent() != StoresVec.back()->getParent())
+        continue;
+      // Make sure that the stores are of the same type.
+      if (!StoresVec.empty() &&
+          SI->getValueOperand()->getType() !=
+              StoresVec.back()->getValueOperand()->getType())
+        continue;
       StoresVec.push_back(SI);
     }
   }
-  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
-  unsigned I = 0;
-  for (auto &P : PtrToStoresMap) {
-    Res[I].swap(P.second);
-    ++I;
-  }
-  return Res;
+  return PtrToStoresMap;
 }
 
 bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
@@ -6448,9 +6445,9 @@ bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
 
   // To avoid calling getPointersDiff() while sorting we create a vector of
   // pairs {store, offset from first} and sort this instead.
-  SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
+  SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
   StoreInst *S0 = StoresVec[0];
-  StoreOffsetVec.emplace_back(0, 0);
+  StoreOffsetVec[0] = {S0, 0};
   Type *S0Ty = S0->getValueOperand()->getType();
   Value *S0Ptr = S0->getPointerOperand();
   for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
@@ -6459,36 +6456,41 @@ bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
         getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
                         SI->getPointerOperand(), *DL, *SE,
                         /*StrictCheck=*/true);
-    StoreOffsetVec.emplace_back(*Diff, Idx);
+    // We failed to compare the pointers so just abandon this StoresVec.
+    if (!Diff)
+      return false;
+    StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
   }
 
+  // Sort the vector based on the pointers. We create a copy because we may
+  // need the original later for calculating the reorder (shuffle) indices.
+  stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
+                                 const std::pair<StoreInst *, int> &Pair2) {
+    int Offset1 = Pair1.second;
+    int Offset2 = Pair2.second;
+    return Offset1 < Offset2;
+  });
+
   // Check if the stores are consecutive by checking if their difference is 1.
-  if (StoreOffsetVec.size() != StoresVec.size())
-    return false;
-  sort(StoreOffsetVec,
-       [](const std::pair<int, unsigned> &L,
-          const std::pair<int, unsigned> &R) { return L.first < R.first; });
-  unsigned Idx = 0;
-  int PrevDist = 0;
-  for (const auto &P : StoreOffsetVec) {
-    if (Idx > 0 && P.first != PrevDist + 1)
+  for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
+    if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
       return false;
-    PrevDist = P.first;
-    ++Idx;
-  }
 
   // Calculate the shuffle indices according to their offset against the sorted
   // StoreOffsetVec.
-  ReorderIndices.assign(StoresVec.size(), 0);
-  bool IsIdentity = true;
-  for (auto [I, P] : enumerate(StoreOffsetVec)) {
-    ReorderIndices[P.second] = I;
-    IsIdentity &= P.second == I;
+  ReorderIndices.reserve(StoresVec.size());
+  for (StoreInst *SI : StoresVec) {
+    unsigned Idx = find_if(StoreOffsetVec,
+                           [SI](const std::pair<StoreInst *, int> &Pair) {
+                             return Pair.first == SI;
+                           }) -
+                   StoreOffsetVec.begin();
+    ReorderIndices.push_back(Idx);
   }
   // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
   // reorderTopToBottom() and reorderBottomToTop(), so we are following the
   // same convention here.
-  if (IsIdentity)
+  if (isIdentityOrder(ReorderIndices))
     ReorderIndices.clear();
 
   return true;
@@ -6506,7 +6508,8 @@ SmallVector<BoUpSLP::OrdersType, 1>
 BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
   unsigned NumLanes = TE->Scalars.size();
 
-  SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
+  DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
+      collectUserStores(TE);
 
   // Holds the reorder indices for each candidate store vector that is a user of
   // the current TreeEntry.
@@ -6515,7 +6518,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
   // Now inspect the stores collected per pointer and look for vectorization
   // candidates. For each candidate calculate the reorder index vector and push
   // it into `ExternalReorderIndices`
-  for (ArrayRef<StoreInst *> StoresVec : Stores) {
+  for (const auto &Pair : PtrToStoresMap) {
+    auto &StoresVec = Pair.second;
     // If we have fewer than NumLanes stores, then we can't form a vector.
     if (StoresVec.size() != NumLanes)
       continue;
@@ -6570,13 +6574,9 @@ static void gatherPossiblyVectorizableLoads(
       continue;
     bool IsFound = false;
     for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
-      assert(LI->getParent() == Data.front().first->getParent() &&
-             LI->getType() == Data.front().first->getType() &&
-             getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
-                 getUnderlyingObject(Data.front().first->getPointerOperand(),
-                                     RecursionMaxDepth) &&
-             "Expected loads with the same type, same parent and same "
-             "underlying pointer.");
+      if (LI->getParent() != Data.front().first->getParent() ||
+          LI->getType() != Data.front().first->getType())
+        continue;
       std::optional<int> Dist = getPointersDiff(
           LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
           Data.front().first->getPointerOperand(), DL, SE,
@@ -6704,9 +6704,7 @@ static void gatherPossiblyVectorizableLoads(
 }
 
 void BoUpSLP::tryToVectorizeGatheredLoads(
-    const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
-                         SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
-                         8> &GatheredLoads) {
+    ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
   GatheredLoadsEntriesFirst = VectorizableTree.size();
 
   SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
@@ -6739,10 +6737,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
     SmallVector<int> CandidateVFs;
     if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
       CandidateVFs.push_back(MaxVF);
-    for (int NumElts = getFloorFullVectorNumberOfElements(
-             *TTI, Loads.front()->getType(), MaxVF);
-         NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
-                          *TTI, Loads.front()->getType(), NumElts - 1)) {
+    for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) {
       CandidateVFs.push_back(NumElts);
       if (VectorizeNonPowerOf2 && NumElts > 2)
         CandidateVFs.push_back(NumElts - 1);
@@ -6756,10 +6751,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
       if (Final && NumElts > BestVF)
         continue;
       SmallVector<unsigned> MaskedGatherVectorized;
-      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
+      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E;
            ++Cnt) {
-        ArrayRef<LoadInst *> Slice =
-            ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
+        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(Cnt, NumElts);
         if (VectorizedLoads.count(Slice.front()) ||
             VectorizedLoads.count(Slice.back()) ||
             areKnownNonVectorizableLoads(Slice))
@@ -7105,27 +7099,24 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
         }
         return NonVectorized;
       };
-  for (const auto &GLs : GatheredLoads) {
-    const auto &Ref = GLs.second;
-    SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
-    if (!Ref.empty() && !NonVectorized.empty() &&
-        std::accumulate(
-            Ref.begin(), Ref.end(), 0u,
-            [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
-              return S + LoadsDists.size();
-            }) != NonVectorized.size() &&
-        IsMaskedGatherSupported(NonVectorized)) {
-      SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
-      for (LoadInst *LI : NonVectorized) {
-        // Reinsert non-vectorized loads to other list of loads with the same
-        // base pointers.
-        gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
-                                        FinalGatheredLoads,
-                                        /*AddNew=*/false);
-      }
-      // Final attempt to vectorize non-vectorized loads.
-      (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
-    }
+  SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(GatheredLoads);
+  if (!GatheredLoads.empty() && !NonVectorized.empty() &&
+      std::accumulate(
+          GatheredLoads.begin(), GatheredLoads.end(), 0u,
+          [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
+            return S + LoadsDists.size();
+          }) != NonVectorized.size() &&
+      IsMaskedGatherSupported(NonVectorized)) {
+    SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
+    for (LoadInst *LI : NonVectorized) {
+      // Reinsert non-vectorized loads to other list of loads with the same
+      // base pointers.
+      gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
+                                      FinalGatheredLoads,
+                                      /*AddNew=*/false);
+    }
+    // Final attempt to vectorize non-vectorized loads.
+    (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
   }
   // Try to vectorize postponed load entries, previously marked as gathered.
   for (unsigned Idx : LoadEntriesToVectorize) {
@@ -7372,6 +7363,13 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
 
+  if (S.MainOp->getType()->isFloatingPointTy() &&
+      TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
+        auto *I = dyn_cast<Instruction>(V);
+        return I && (I->isBinaryOp() || isa<CallInst>(I)) && !I->isFast();
+      }))
+    return TreeEntry::NeedToGather;
+
   unsigned ShuffleOrOp =
       S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
   auto *VL0 = cast<Instruction>(S.OpValue);
@@ -7536,12 +7534,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::Freeze:
-    if (S.MainOp->getType()->isFloatingPointTy() &&
-        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
-          auto *I = dyn_cast<Instruction>(V);
-          return I && I->isBinaryOp() && !I->isFast();
-        }))
-      return TreeEntry::NeedToGather;
     return TreeEntry::Vectorize;
   case Instruction::GetElementPtr: {
     // We don't combine GEPs with complicated (nested) indexing.
@@ -7633,12 +7625,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::NeedToGather;
   }
   case Instruction::Call: {
-    if (S.MainOp->getType()->isFloatingPointTy() &&
-        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
-          auto *I = dyn_cast<Instruction>(V);
-          return I && !I->isFast();
-        }))
-      return TreeEntry::NeedToGather;
     // Check if the calls are all to the same vectorizable intrinsic or
     // library function.
     CallInst *CI = cast<CallInst>(VL0);
@@ -9358,13 +9344,8 @@ void BoUpSLP::transformNodes() {
       // insertvector instructions.
       unsigned StartIdx = 0;
       unsigned End = VL.size();
-      for (unsigned VF = getFloorFullVectorNumberOfElements(
-               *TTI, VL.front()->getType(), VL.size() - 1);
-           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
-                            *TTI, VL.front()->getType(), VF - 1)) {
-        if (StartIdx + VF > End)
-          continue;
-        SmallVector<std::pair<unsigned, unsigned>> Slices;
+      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
+        SmallVector<unsigned> Slices;
         for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
           ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
           // If any instruction is vectorized already - do not try again.
@@ -9394,10 +9375,7 @@ void BoUpSLP::transformNodes() {
             if (IsSplat)
               continue;
             InstructionsState S = getSameOpcode(Slice, *TLI);
-            if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice) ||
-                (S.getOpcode() == Instruction::Load &&
-                 areKnownNonVectorizableLoads(Slice)) ||
-                (S.getOpcode() != Instruction::Load && !has_single_bit(VF)))
+            if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
               continue;
             if (VF == 2) {
               // Try to vectorize reduced values or if all users are vectorized.
@@ -9417,16 +9395,8 @@ void BoUpSLP::transformNodes() {
                     canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                 // Do not vectorize gathers.
                 if (Res == LoadsState::ScatterVectorize ||
-                    Res == LoadsState::Gather) {
-                  if (Res == LoadsState::Gather) {
-                    registerNonVectorizableLoads(Slice);
-                    // If reductions and the scalars from the root node are
-                    // analyzed - mark as non-vectorizable reduction.
-                    if (UserIgnoreList && E.Idx == 0)
-                      analyzedReductionVals(Slice);
-                  }
+                    Res == LoadsState::Gather)
                   continue;
-                }
               } else if (S.getOpcode() == Instruction::ExtractElement ||
                          (TTI->getInstructionCost(
                               cast<Instruction>(Slice.front()), CostKind) <
@@ -9441,17 +9411,17 @@ void BoUpSLP::transformNodes() {
               }
             }
           }
-          Slices.emplace_back(Cnt, Slice.size());
+          Slices.emplace_back(Cnt);
         }
-        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
+        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
           E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
           if (StartIdx == Cnt)
-            StartIdx = Cnt + Sz;
-          if (End == Cnt + Sz)
+            StartIdx = Cnt + VF;
+          if (End == Cnt + VF)
             End = Cnt;
         };
-        for (auto [Cnt, Sz] : Slices) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
+        for (unsigned Cnt : Slices) {
+          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
           // If any instruction is vectorized already - do not try again.
           if (TreeEntry *SE = getTreeEntry(Slice.front());
               SE || getTreeEntry(Slice.back())) {
@@ -9460,7 +9430,7 @@ void BoUpSLP::transformNodes() {
             if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
               continue;
             SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
-            AddCombinedNode(SE->Idx, Cnt, Sz);
+            AddCombinedNode(SE->Idx, Cnt);
             continue;
           }
           unsigned PrevSize = VectorizableTree.size();
@@ -9472,14 +9442,12 @@ void BoUpSLP::transformNodes() {
               VectorizableTree[PrevSize]->getOpcode() !=
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
-            if (UserIgnoreList && E.Idx == 0 && VF == 2)
-              analyzedReductionVals(Slice);
             VectorizableTree.pop_back();
             assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                    "LoadEntriesToVectorize expected to remain the same");
             continue;
           }
-          AddCombinedNode(PrevSize, Cnt, Sz);
+          AddCombinedNode(PrevSize, Cnt);
         }
       }
     }
@@ -9574,24 +9542,11 @@ void BoUpSLP::transformNodes() {
          VectorizableTree.front()->Scalars.size() == SmallVF) ||
         (VectorizableTree.size() <= 2 && UserIgnoreList))
       return;
-
-    if (VectorizableTree.front()->isNonPowOf2Vec() &&
-        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
-        getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
-                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() &&
-                          TE->getOpcode() == Instruction::Load &&
-                          !allSameBlock(TE->Scalars);
-                 }) == 1)
-      return;
   }
 
   // A list of loads to be gathered during the vectorization process. We can
   // try to vectorize them at the end, if profitable.
-  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
-                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
-      GatheredLoads;
+  SmallVector<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads;
 
   for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     TreeEntry &E = *TE;
@@ -9603,21 +9558,9 @@ void BoUpSLP::transformNodes() {
                                             !isVectorized(V) &&
                                             !isDeleted(cast<Instruction>(V));
                                    }))) &&
-        !isSplat(E.Scalars)) {
-      for (Value *V : E.Scalars) {
-        auto *LI = dyn_cast<LoadInst>(V);
-        if (!LI)
-          continue;
-        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
-          continue;
-        gatherPossiblyVectorizableLoads(
-            *this, V, *DL, *SE, *TTI,
-            GatheredLoads[std::make_tuple(
-                LI->getParent(),
-                getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth),
-                LI->getType())]);
-      }
-    }
+        !isSplat(E.Scalars))
+      gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI,
+                                      GatheredLoads);
   }
   // Try to vectorize gathered loads if this is not just a gather of loads.
   if (!GatheredLoads.empty())
@@ -11572,34 +11515,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   return true;
 }
 
-bool BoUpSLP::isTreeNotExtendable() const {
-  if (getCanonicalGraphSize() != getTreeSize()) {
-    constexpr unsigned SmallTree = 3;
-    if (VectorizableTree.front()->isNonPowOf2Vec() &&
-        getCanonicalGraphSize() <= SmallTree &&
-        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
-                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() &&
-                          TE->getOpcode() == Instruction::Load &&
-                          !allSameBlock(TE->Scalars);
-                 }) == 1)
-      return true;
-    return false;
-  }
-  bool Res = false;
-  for (unsigned Idx : seq<unsigned>(getTreeSize())) {
-    TreeEntry &E = *VectorizableTree[Idx];
-    if (!E.isGather())
-      continue;
-    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
-      return false;
-    if (isSplat(E.Scalars) || allConstant(E.Scalars))
-      continue;
-    Res = true;
-  }
-  return Res;
-}
-
 InstructionCost BoUpSLP::getSpillCost() const {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
@@ -18856,8 +18771,7 @@ class HorizontalReduction {
 
     auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
       Key = hash_combine(hash_value(LI->getParent()), Key);
-      Value *Ptr =
-          getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
+      Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
       if (!LoadKeyUsed.insert(Key).second) {
         auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
         if (LIt != LoadsMap.end()) {
@@ -19180,28 +19094,8 @@ class HorizontalReduction {
           RegMaxNumber * RedValsMaxNumber);
 
       unsigned ReduxWidth = NumReducedVals;
-      auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
-        unsigned NumParts, NumRegs;
-        Type *ScalarTy = Candidates.front()->getType();
-        ReduxWidth =
-            getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
-        VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
-        NumParts = TTI.getNumberOfParts(Tp);
-        NumRegs =
-            TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
-        while (NumParts > NumRegs) {
-          ReduxWidth = bit_floor(ReduxWidth - 1);
-          VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
-          NumParts = TTI.getNumberOfParts(Tp);
-          NumRegs =
-              TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
-        }
-        if (NumParts > NumRegs / 2)
-          ReduxWidth = bit_floor(ReduxWidth);
-        return ReduxWidth;
-      };
       if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
-        ReduxWidth = GetVectorFactor(ReduxWidth);
+        ReduxWidth = bit_floor(ReduxWidth);
       ReduxWidth = std::min(ReduxWidth, MaxElts);
 
       unsigned Start = 0;
@@ -19209,7 +19103,10 @@ class HorizontalReduction {
       // Restarts vectorization attempt with lower vector factor.
       unsigned PrevReduxWidth = ReduxWidth;
       bool CheckForReusedReductionOpsLocal = false;
-      auto AdjustReducedVals = [&](bool IgnoreVL = false) {
+      auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+                                  &CheckForReusedReductionOpsLocal,
+                                  &PrevReduxWidth, &V,
+                                  &IgnoreList](bool IgnoreVL = false) {
         bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
         if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
           // Check if any of the reduction ops are gathered. If so, worth
@@ -19220,13 +19117,10 @@ class HorizontalReduction {
         if (Pos < NumReducedVals - ReduxWidth + 1)
           return IsAnyRedOpGathered;
         Pos = Start;
-        --ReduxWidth;
-        if (ReduxWidth > 1)
-          ReduxWidth = GetVectorFactor(ReduxWidth);
+        ReduxWidth = bit_ceil(ReduxWidth) / 2;
         return IsAnyRedOpGathered;
       };
       bool AnyVectorized = false;
-      SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
       while (Pos < NumReducedVals - ReduxWidth + 1 &&
              ReduxWidth >= ReductionLimit) {
         // Dependency in tree of the reduction ops - drop this attempt, try
@@ -19238,15 +19132,8 @@ class HorizontalReduction {
         }
         PrevReduxWidth = ReduxWidth;
         ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
-        // Been analyzed already - skip.
-        if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
-            (!has_single_bit(ReduxWidth) &&
-             (IgnoredCandidates.contains(
-                  std::make_pair(Pos, bit_floor(ReduxWidth))) ||
-              IgnoredCandidates.contains(
-                  std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
-                                 bit_floor(ReduxWidth))))) ||
-            V.areAnalyzedReductionVals(VL)) {
+        // Beeing analyzed already - skip.
+        if (V.areAnalyzedReductionVals(VL)) {
           (void)AdjustReducedVals(/*IgnoreVL=*/true);
           continue;
         }
@@ -19352,24 +19239,8 @@ class HorizontalReduction {
                    << " and threshold "
                    << ore::NV("Threshold", -SLPCostThreshold);
           });
-          if (!AdjustReducedVals()) {
+          if (!AdjustReducedVals())
             V.analyzedReductionVals(VL);
-            unsigned Offset = Pos == Start ? Pos : Pos - 1;
-            if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
-              // Add subvectors of VL to the list of the analyzed values.
-              for (unsigned VF = getFloorFullVectorNumberOfElements(
-                       *TTI, VL.front()->getType(), ReduxWidth - 1);
-                   VF >= ReductionLimit;
-                   VF = getFloorFullVectorNumberOfElements(
-                       *TTI, VL.front()->getType(), VF - 1)) {
-                if (has_single_bit(VF) &&
-                    V.getCanonicalGraphSize() != V.getTreeSize())
-                  continue;
-                for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
-                  IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
-              }
-            }
-          }
           continue;
         }
 
@@ -19478,9 +19349,7 @@ class HorizontalReduction {
         }
         Pos += ReduxWidth;
         Start = Pos;
-        ReduxWidth = NumReducedVals - Pos;
-        if (ReduxWidth > 1)
-          ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
+        ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
         AnyVectorized = true;
       }
       if (OptReusedScalars && !AnyVectorized) {

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
index 354791ddd6dee6..2121775224098e 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll
@@ -43,32 +43,32 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP34]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]]
@@ -86,19 +86,19 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32
 ; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <16 x i32> [[TMP48]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP53]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = sub nsw <16 x i32> [[TMP50]], [[TMP52]]
-; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 3, i32 4, i32 7, i32 16, i32 19, i32 20, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 2, i32 5, i32 6, i32 17, i32 18, i32 21, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP54]], <16 x i32> [[TMP55]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP60:%.*]] = sub nsw <16 x i32> [[TMP57]], [[TMP59]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = add nsw <16 x i32> [[TMP56]], [[TMP58]]
-; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP66:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP62]], [[TMP64]]
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 20, i32 16, i32 21, i32 17, i32 22, i32 18, i32 23, i32 19>
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP69:%.*]] = lshr <16 x i32> [[TMP68]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP70:%.*]] = and <16 x i32> [[TMP69]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP71:%.*]] = mul nuw <16 x i32> [[TMP70]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index b16164c4e5ff8b..5f0b16048d40c8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -1215,26 +1215,26 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -1242,14 +1242,14 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -1262,7 +1262,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 18, i32 22, i32 26, i32 30, i32 1, i32 5, i32 9, i32 13, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
 ; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index c431b058f0d2d5..fffa626cae0dda 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,17 +17,18 @@
 
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    store <4 x float> [[TMP8]], ptr [[A]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr %a, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
index 2191d04cd797d4..833bc56c4ec6b6 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
@@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
 ; NON-POWER-OF-2-NEXT:  entry:
 ; NON-POWER-OF-2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
 ; NON-POWER-OF-2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
-; NON-POWER-OF-2-NEXT:    [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
+; NON-POWER-OF-2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; NON-POWER-OF-2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
 ; NON-POWER-OF-2-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
 ; NON-POWER-OF-2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
 ; NON-POWER-OF-2-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
index 234b658032383e..757d0b1708b6fb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -11,21 +11,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[Y0]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[I7]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP11]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0)
 ; CHECK-NEXT:    [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT:    store <4 x float> [[TMP16]], ptr [[RC21]], align 4
 ; CHECK-NEXT:    br label [[IF_END:%.*]]
 ; CHECK:       entry.if.end72_crit_edge:
 ; CHECK-NEXT:    br label [[IF_END72:%.*]]
@@ -48,7 +46,8 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
-; CHECK-NEXT:    store <4 x float> [[TMP26]], ptr [[RC21]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[RC21]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index c9ff2d6426d2b6..72e29839230e81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -318,14 +318,22 @@ entry:
 define float @f(ptr nocapture readonly %x) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret float [[OP_RDX]]
 ;
 ; THRESHOLD-LABEL: @f(
 ; THRESHOLD-NEXT:  entry:
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
 ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
 ;
   entry:
@@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
 ; CHECK-LABEL: @loadadd31(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
 ; CHECK-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
 ; THRESHOLD-LABEL: @loadadd31(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
 ; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
 ; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
 ; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
 ; THRESHOLD-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
 ; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
 ; THRESHOLD-NEXT:    [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 0bc91d42b0f132..a7201e776fb4cd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1013,11 +1013,11 @@ define i32 @maxi8_wrong_parent(i32) {
 ; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
 ; THRESH-NEXT:    br label [[PP:%.*]]
 ; THRESH:       pp:
-; THRESH-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
-; THRESH-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP4]], i64 4)
-; THRESH-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP2]], i64 6)
+; THRESH-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
+; THRESH-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4)
+; THRESH-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0)
+; THRESH-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2)
 ; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]])
 ; THRESH-NEXT:    ret i32 [[TMP8]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
index 4898111960c0fd..47dd84c7f6e9cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
@@ -7,11 +7,10 @@ define void @e(ptr %c, i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[C]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 104
-; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 112
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[C]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX5]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP18]], <2 x ptr> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x ptr>, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2
@@ -19,7 +18,7 @@ define void @e(ptr %c, i64 %0) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP8]], <2 x ptr> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <6 x ptr> @llvm.vector.insert.v6p0.v2p0(<6 x ptr> [[TMP9]], <2 x ptr> [[TMP6]], i64 4)
 ; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i64> [[TMP13]], <32 x i64> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <32 x i64> [[TMP14]], [[TMP12]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
index 2623b7689f4fc2..93258f2975f341 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll
@@ -8,6 +8,197 @@
 ; into bb1, vectorizing all the way to the broadcast load at the top.
 ; The stores in bb1 are external to this tree, but they are vectorizable and are
 ; in reverse order.
+define void @rotate_with_external_users(ptr %A, ptr %ptr) {
+; CHECK-LABEL: @rotate_with_external_users(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[SHUFFLE]], <double 2.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], <double 4.400000e+00, double 3.300000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    ret void
+;
+bb1:
+  %ld = load double, ptr undef
+
+  %add1 = fadd double %ld, 1.1
+  %add2 = fadd double %ld, 2.2
+
+  %mul1 = fmul double %add1, 1.1
+  %mul2 = fmul double %add2, 2.2
+
+  ; These are external vectorizable stores with operands in reverse order.
+  %ptrA2 = getelementptr inbounds double, ptr %A, i64 1
+  store double %mul2, ptr %A
+  store double %mul1, ptr %ptrA2
+  br label %bb2
+
+bb2:
+  %add3 = fadd double %mul1, 3.3
+  %add4 = fadd double %mul2, 4.4
+  %seed = fcmp ogt double %add3, %add4
+  ret void
+}
+
+; This checks that non-consecutive external users are skipped.
+define void @non_consecutive_external_users(ptr %A, ptr %ptr) {
+; CHECK-LABEL: @non_consecutive_external_users(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[LD]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[SHUFFLE]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x double> [[TMP1]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP2]], <double 1.100000e+00, double 2.200000e+00, double 3.300000e+00, double 4.400000e+00>
+; CHECK-NEXT:    [[PTRA4:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; CHECK-NEXT:    store double [[TMP4]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; CHECK-NEXT:    store double [[TMP5]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    store double [[TMP6]], ptr [[PTRA4]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    store double [[TMP7]], ptr [[PTRA4]], align 8
+; CHECK-NEXT:    br label [[SEED_LOOP:%.*]]
+; CHECK:       seed_loop:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x double> [ [[TMP3]], [[BB1:%.*]] ], [ zeroinitializer, [[SEED_LOOP]] ]
+; CHECK-NEXT:    br label [[SEED_LOOP]]
+;
+bb1:
+  %ld = load double, ptr undef
+
+  %add5 = fadd double %ld, 1.1
+  %add6 = fadd double %ld, 2.2
+  %add7 = fadd double %ld, 3.3
+  %add8 = fadd double %ld, 4.4
+
+  %add1 = fadd double %add5, 1.1
+  %add2 = fadd double %add6, 2.2
+  %add3 = fadd double %add7, 3.3
+  %add4 = fadd double %add8, 4.4
+
+  %mul1 = fmul double %add1, 1.1
+  %mul2 = fmul double %add2, 2.2
+  %mul3 = fmul double %add3, 3.3
+  %mul4 = fmul double %add4, 4.4
+
+  ; External non-consecutive stores.
+  %ptrA4 = getelementptr inbounds double, ptr %A, i64 3
+  store double %mul4, ptr %A
+  store double %mul3, ptr %A
+  store double %mul2, ptr %ptrA4
+  store double %mul1, ptr %ptrA4
+  br label %seed_loop
+
+seed_loop:
+  %phi1 = phi double [ %mul1, %bb1 ], [ 0.0, %seed_loop ]
+  %phi2 = phi double [ %mul2, %bb1 ], [ 0.0, %seed_loop ]
+  %phi3 = phi double [ %mul3, %bb1 ], [ 0.0, %seed_loop ]
+  %phi4 = phi double [ %mul4, %bb1 ], [ 0.0, %seed_loop ]
+  br label %seed_loop
+}
+
+; We have to be careful when the tree contains add/sub patterns that could be
+; combined into a single addsub instruction. Reordering can block the pattern.
+define void @addsub_and_external_users(ptr %A, ptr %ptr) {
+; CHECK-LABEL: @addsub_and_external_users(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.100000e+00, double 2.200000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.100000e+00, double 3.200000e+00>
+; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    store <2 x double> [[SHUFFLE1]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.100000e+00, double 4.200000e+00>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    ret void
+;
+bb1:
+  %ld = load double, ptr undef
+
+  %sub1 = fsub double %ld, 1.1
+  %add2 = fadd double %ld, 1.2
+
+  %div1 = fdiv double %sub1, 2.1
+  %div2 = fdiv double %add2, 2.2
+
+  %mul1 = fmul double %div1, 3.1
+  %mul2 = fmul double %div2, 3.2
+
+  ; These are external vectorizable stores with operands in reverse order.
+  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
+  store double %mul2, ptr %A
+  store double %mul1, ptr %ptrA1
+  br label %bb2
+
+bb2:
+  %addS1 = fadd double %mul1, 4.1
+  %addS2 = fadd double %mul2, 4.2
+  %seed = fcmp ogt double %addS1, %addS2
+  ret void
+}
+
+; This contains a sub/add bundle, reordering it will make it better.
+define void @subadd_and_external_users(ptr %A, ptr %ptr) {
+; CHECK-LABEL: @subadd_and_external_users(
+; CHECK-NEXT:  bb1:
+; CHECK-NEXT:    [[LD:%.*]] = load double, ptr undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], <double 2.200000e+00, double 2.100000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], <double 3.200000e+00, double 3.100000e+00>
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP5]], <double 4.200000e+00, double 4.100000e+00>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    ret void
+;
+bb1:
+  %ld = load double, ptr undef
+
+  %add1 = fadd double %ld, 1.1
+  %sub2 = fsub double %ld, 1.2
+
+  %div1 = fdiv double %add1, 2.1
+  %div2 = fdiv double %sub2, 2.2
+
+  %mul1 = fmul double %div1, 3.1
+  %mul2 = fmul double %div2, 3.2
+
+  ; These are external vectorizable stores with operands in reverse order.
+  %ptrA1 = getelementptr inbounds double, ptr %A, i64 1
+  store double %mul2, ptr %A
+  store double %mul1, ptr %ptrA1
+  br label %bb2
+
+bb2:
+  %addS1 = fadd double %mul1, 4.1
+  %addS2 = fadd double %mul2, 4.2
+  %seed = fcmp ogt double %addS1, %addS2
+  ret void
+}
+
 define void @alt_but_not_addsub_and_external_users(ptr %A, ptr %ptr) {
 ; CHECK-LABEL: @alt_but_not_addsub_and_external_users(
 ; CHECK-NEXT:  bb1:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
index a821362a883a11..fd3c1a57aff344 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
@@ -7,7 +7,8 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) {
 ; NON-POW2-NEXT:  entry:
 ; NON-POW2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
 ; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
-; NON-POW2-NEXT:    [[TMP4:%.*]] = call <3 x float> @llvm.vector.insert.v3f32.v2f32(<3 x float> [[TMP2]], <2 x float> [[TMP1]], i64 0)
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
 ; NON-POW2-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
 ; NON-POW2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
 ; NON-POW2-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
index 9719e60a6a695d..e1b091cc6fcda7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
@@ -8,18 +8,18 @@ define void @test(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0)
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[RESULT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]

