[llvm] [SLP]Initial support for non-power-of-2 vectorization (PR #151530)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 7 12:55:09 PDT 2025


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/151530

>From 6323547ceb986951e37992421440e1a90f2dc364 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 31 Jul 2025 14:56:33 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 450 +++++++++++-------
 .../AArch64/InstructionsState-is-invalid-0.ll |  24 +-
 .../SLPVectorizer/AArch64/PR38339.ll          |   8 +-
 .../alternate-vectorization-split-node.ll     |   6 +-
 .../AArch64/reused-scalar-repeated-in-node.ll |  30 +-
 .../AArch64/transpose-inseltpoison.ll         |  20 +-
 .../SLPVectorizer/AArch64/transpose.ll        |  20 +-
 .../SLPVectorizer/AArch64/tsc-s116.ll         |  11 +-
 .../vectorizable-selects-uniform-cmps.ll      |  25 +-
 .../RISCV/gather-insert-point-restore.ll      |   7 +-
 .../RISCV/reordered-buildvector-scalars.ll    | 206 +++-----
 .../RISCV/smin-signed-zextended.ll            |   6 +-
 .../RISCV/unordered-loads-operands.ll         |  44 +-
 .../SystemZ/reuse-non-power-of-2-reorder.ll   |  18 +-
 .../Transforms/SLPVectorizer/X86/PR39774.ll   |  16 +-
 .../Transforms/SLPVectorizer/X86/PR40310.ll   |  10 +-
 .../Transforms/SLPVectorizer/X86/bool-mask.ll |  16 +-
 .../SLPVectorizer/X86/bv-shuffle-mask.ll      |   4 +-
 llvm/test/Transforms/SLPVectorizer/X86/cse.ll |   9 +-
 .../X86/div-possibly-extended-with-poisons.ll |  32 +-
 .../X86/extractelement-multi-register-use.ll  |  14 +-
 .../extractelement-single-use-many-nodes.ll   |  13 +-
 .../X86/extractelements-vector-ops-shuffle.ll |   2 +-
 .../X86/extractelemets-extended-by-poison.ll  |  16 +-
 .../X86/full-match-with-poison-scalar.ll      |   8 +-
 .../X86/full-matched-bv-with-subvectors.ll    |  26 +-
 .../X86/gather-node-same-reduced.ll           |   6 +-
 .../X86/gathered-shuffle-resized.ll           |  12 +-
 .../X86/identity-match-splat-less-defined.ll  |  29 +-
 .../SLPVectorizer/X86/insert-subvector.ll     |  23 +-
 .../X86/load-merge-inseltpoison.ll            |  35 +-
 .../SLPVectorizer/X86/load-merge.ll           |  35 +-
 .../X86/load-partial-vector-shuffle.ll        |  27 +-
 .../SLPVectorizer/X86/malformed_phis.ll       |   5 +-
 .../X86/non-power-of-2-order-detection.ll     |   6 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll |   2 -
 .../Transforms/SLPVectorizer/X86/pr47629.ll   |   2 -
 .../Transforms/SLPVectorizer/X86/pr47642.ll   |   7 +-
 .../Transforms/SLPVectorizer/X86/pr49081.ll   |   6 +-
 .../reduced-val-vectorized-in-transform.ll    |   2 +-
 .../X86/reorder-reused-masked-gather.ll       |   1 -
 .../X86/reorder-reused-masked-gather2.ll      |   1 -
 .../X86/reordered-masked-loads.ll             |  14 +-
 .../X86/replaced-external-in-reduction.ll     |   7 +-
 .../resized-bv-values-non-power-of2-node.ll   |  66 +--
 .../X86/reuse-extracts-in-wider-vect.ll       |   6 +-
 .../revec-reduced-value-vectorized-later.ll   |  15 +-
 .../X86/same-values-sub-node-with-poisons.ll  |   8 +-
 .../SLPVectorizer/X86/shrink_after_reorder.ll |   4 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |   1 -
 .../X86/whole-registers-compare.ll            |   6 +-
 .../extract-many-users-buildvector.ll         |  47 +-
 ...hered-consecutive-loads-different-types.ll |   2 +-
 .../insertelement-across-zero.ll              |   2 +-
 .../reduction-whole-regs-loads.ll             |   9 +-
 .../SLPVectorizer/reorder-clustered-node.ll   |   8 +-
 .../SLPVectorizer/revec-shufflevector.ll      |  70 +--
 llvm/test/Transforms/SLPVectorizer/revec.ll   |  16 +-
 .../SLPVectorizer/shuffle-mask-resized.ll     |  47 +-
 59 files changed, 740 insertions(+), 828 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 593868fb8811a..e2d10b69fbb0d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1856,8 +1856,10 @@ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
   if (NumParts == 0 || NumParts >= Limit)
     return 1;
   unsigned Sz = getNumElements(VecTy);
-  if (NumParts >= Sz || Sz % NumParts != 0 ||
-      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts))
+  unsigned PWSz =
+      getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
+  if (NumParts >= Sz || PWSz % NumParts != 0 ||
+      !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
     return 1;
   return NumParts;
 }
@@ -1994,6 +1996,9 @@ class BoUpSLP {
         VectorizableTree.front()->getVectorFactor());
   }
 
+  /// Returns true if the tree is a reduction tree.
+  bool isReductionTree() const { return UserIgnoreList; }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -2185,6 +2190,21 @@ class BoUpSLP {
                                unsigned *BestVF = nullptr,
                                bool TryRecursiveCheck = true) const;
 
+  /// Checks if the given array of vectorized values has the same node in the
+  /// tree.
+  bool hasSameNode(const InstructionsState &S, ArrayRef<Value *> VL) const {
+    if (S) {
+      if (any_of(getTreeEntries(S.getMainOp()),
+                 [&](const TreeEntry *TE) { return TE->isSame(VL); }))
+        return true;
+      return any_of(ValueToGatherNodes.lookup(S.getMainOp()),
+                    [&](const TreeEntry *TE) { return TE->isSame(VL); });
+    }
+    return any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
+      return TE->isGather() && TE->isSame(VL);
+    });
+  }
+
   /// Registers non-vectorizable sequence of loads
   template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
     ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
@@ -3224,11 +3244,7 @@ class BoUpSLP {
               }))
             return false;
         }
-        // TODO: Check if we can remove a check for non-power-2 number of
-        // scalars after full support of non-power-2 vectorization.
-        return UniqueValues.size() != 2 &&
-               hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
-                                        UniqueValues.size());
+        return UniqueValues.size() != 2;
       };
 
       // If the initial strategy fails for any of the operand indexes, then we
@@ -3663,8 +3679,8 @@ class BoUpSLP {
   std::optional<TargetTransformInfo::ShuffleKind>
   isGatherShuffledSingleRegisterEntry(
       const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
-      bool ForOrder);
+      SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
+      unsigned SliceSize);
 
   /// Checks if the gathered \p VL can be represented as multi-register
   /// shuffle(s) of previous tree entries.
@@ -4055,17 +4071,6 @@ class BoUpSLP {
       return IsNonPowerOf2;
     }
 
-    /// Return true if this is a node, which tries to vectorize number of
-    /// elements, forming whole vectors.
-    bool
-    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
-      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
-          TTI, getValueType(Scalars.front()), Scalars.size());
-      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
-             "Reshuffling not supported with non-power-of-2 vectors yet.");
-      return IsNonPowerOf2;
-    }
-
     Value *getOrdered(unsigned Idx) const {
       assert(isGather() && "Must be used only for buildvectors/gathers.");
       if (ReorderIndices.empty())
@@ -4222,12 +4227,6 @@ class BoUpSLP {
     if (UserTreeIdx.UserTE)
       OperandsToTreeEntry.try_emplace(
           std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
-    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
-    // for non-power-of-two vectors.
-    assert(
-        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
-         ReuseShuffleIndices.empty()) &&
-        "Reshuffling scalars not yet supported for nodes with padding");
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     if (ReorderIndices.empty()) {
@@ -4386,21 +4385,16 @@ class BoUpSLP {
   class ScalarsVectorizationLegality {
     InstructionsState S;
     bool IsLegal;
-    bool TryToFindDuplicates;
     bool TrySplitVectorize;
 
   public:
     ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
-                                 bool TryToFindDuplicates = true,
                                  bool TrySplitVectorize = false)
-        : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
-          TrySplitVectorize(TrySplitVectorize) {
-      assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
-             "Inconsistent state");
+        : S(S), IsLegal(IsLegal), TrySplitVectorize(TrySplitVectorize) {
+      assert((!IsLegal || S.valid()) && "Inconsistent state");
     }
     const InstructionsState &getInstructionsState() const { return S; };
     bool isLegal() const { return IsLegal; }
-    bool tryToFindDuplicates() const { return TryToFindDuplicates; }
     bool trySplitVectorize() const { return TrySplitVectorize; }
   };
 
@@ -5567,7 +5561,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                   ArrayRef<int> Mask, int PartSz, int NumParts,
                                   function_ref<unsigned(unsigned)> GetVF) {
-    for (int I : seq<int>(0, NumParts)) {
+    for (int I : seq<int>(NumParts)) {
       if (ShuffledSubMasks.test(I))
         continue;
       const int VF = GetVF(I);
@@ -5618,6 +5612,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
           SecondVecFound = true;
           break;
         }
+        if (static_cast<unsigned>(I * PartSz + Idx) >= CurrentOrder.size())
+          break;
         if (CurrentOrder[I * PartSz + Idx] >
                 static_cast<unsigned>(I * PartSz + K) &&
             CurrentOrder[I * PartSz + Idx] !=
@@ -5636,12 +5632,14 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   if (!ExtractShuffles.empty())
     TransformMaskToOrder(
         CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
-          if (!ExtractShuffles[I])
+          if (I >= ExtractShuffles.size() || !ExtractShuffles[I])
             return 0U;
           unsigned VF = 0;
           unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
           for (unsigned Idx : seq<unsigned>(Sz)) {
             int K = I * PartSz + Idx;
+            if (static_cast<unsigned>(K) >= ExtractMask.size())
+              break;
             if (ExtractMask[K] == PoisonMaskElem)
               continue;
             if (!TE.ReuseShuffleIndices.empty())
@@ -5669,7 +5667,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
   }
   if (!Entries.empty())
     TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
-      if (!GatherShuffles[I])
+      if (I >= GatherShuffles.size() || !GatherShuffles[I])
         return 0U;
       return std::max(Entries[I].front()->getVectorFactor(),
                       Entries[I].back()->getVectorFactor());
@@ -6381,12 +6379,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     if (!TryRecursiveCheck || VL.size() < ListLimit)
       return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
 
-    // FIXME: The following code has not been updated for non-power-of-2
-    // vectors (and not whole registers).  The splitting logic here does not
-    // cover the original vector if the vector factor is not a power of two.
-    if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
-      return false;
-
     unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
     unsigned MinVF = getMinVF(2 * Sz);
     DemandedElts.clearAllBits();
@@ -6397,8 +6389,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
          VF >= MinVF;
          VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
       SmallVector<LoadsState> States;
-      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
-        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+      for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) {
+        ArrayRef<Value *> Slice = VL.slice(Cnt, std::min(VF, End - Cnt));
         SmallVector<unsigned> Order;
         SmallVector<Value *> PointerOps;
         LoadsState LS =
@@ -6410,7 +6402,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             DemandedElts.setAllBits();
             break;
           }
-          DemandedElts.setBits(Cnt, Cnt + VF);
+          DemandedElts.setBits(Cnt, Cnt + Slice.size());
           continue;
         }
         // If need the reorder - consider as high-cost masked gather for now.
@@ -6436,13 +6428,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             VecLdCost +=
                 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
       }
-      auto *SubVecTy = getWidenedType(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
+        const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - I * VF);
+        auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
         InstructionCost VectorGEPCost =
             (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                 ? 0
-                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, SliceVF),
                               LI0->getPointerOperand(),
                               Instruction::GetElementPtr, CostKind, ScalarTy,
                               SubVecTy)
@@ -6456,12 +6449,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                        getUnderlyingObject(PointerOps.front());
               }))
             VectorGEPCost += getScalarizationOverhead(
-                TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
+                TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF),
                 /*Insert=*/true, /*Extract=*/false, CostKind);
           else
             VectorGEPCost +=
                 getScalarizationOverhead(
-                    TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
+                    TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0),
                     /*Insert=*/true, /*Extract=*/false, CostKind) +
                 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                                  CostKind);
@@ -6501,7 +6494,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
           continue;
         }
         SmallVector<int> ShuffleMask(VL.size());
-        for (int Idx : seq<int>(0, VL.size()))
+        for (int Idx : seq<int>(VL.size()))
           ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
         if (I > 0)
           VecLdCost +=
@@ -6740,10 +6733,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
   if (!TE.ReuseShuffleIndices.empty()) {
-    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
-    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
-           "Reshuffling scalars not yet supported for nodes with padding");
-
     if (isSplat(TE.Scalars))
       return std::nullopt;
     // Check if reuse shuffle indices can be improved by reordering.
@@ -7082,12 +7071,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
           Res == LoadsState::CompressVectorize)
         return std::move(CurrentOrder);
     }
-    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
-    // has been auditted for correctness with non-power-of-two vectors.
-    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
-      if (std::optional<OrdersType> CurrentOrder =
-              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
-        return CurrentOrder;
+    if (std::optional<OrdersType> CurrentOrder =
+            findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
+      return CurrentOrder;
   }
   return std::nullopt;
 }
@@ -7338,7 +7324,7 @@ void BoUpSLP::reorderTopToBottom() {
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor();
-       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
+       !VFToOrderedEntries.empty() && VF > 1; --VF) {
     auto It = VFToOrderedEntries.find(VF);
     if (It == VFToOrderedEntries.end())
       continue;
@@ -8530,17 +8516,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
           AllowToVectorize = CheckIfAllowed(Slice);
         } else {
           AllowToVectorize =
-              (NumElts >= 3 ||
-               any_of(ValueToGatherNodes.at(Slice.front()),
-                      [=](const TreeEntry *TE) {
-                        return TE->Scalars.size() == 2 &&
-                               ((TE->Scalars.front() == Slice.front() &&
-                                 TE->Scalars.back() == Slice.back()) ||
-                                (TE->Scalars.front() == Slice.back() &&
-                                 TE->Scalars.back() == Slice.front()));
-                      })) &&
-              hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
-                                       Slice.size());
+              NumElts >= 3 ||
+              any_of(ValueToGatherNodes.at(Slice.front()),
+                     [=](const TreeEntry *TE) {
+                       return TE->Scalars.size() == 2 &&
+                              ((TE->Scalars.front() == Slice.front() &&
+                                TE->Scalars.back() == Slice.back()) ||
+                               (TE->Scalars.front() == Slice.back() &&
+                                TE->Scalars.back() == Slice.front()));
+                     });
         }
         if (AllowToVectorize) {
           SmallVector<Value *> PointerOps;
@@ -9194,10 +9178,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     [[fallthrough]];
   case Instruction::ExtractValue: {
     bool Reuse = canReuseExtract(VL, CurrentOrder);
-    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
-    // non-full registers).
-    if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
-      return TreeEntry::NeedToGather;
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -9705,7 +9685,7 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                 const TargetLibraryInfo &TLI,
                                 const InstructionsState &S,
                                 const BoUpSLP::EdgeInfo &UserTreeIdx,
-                                bool TryPad = false) {
+                                const BoUpSLP &R, bool BuildGatherOnly = true) {
   // Check that every instruction appears once in this bundle.
   SmallVector<Value *> UniqueValues;
   SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
@@ -9726,66 +9706,151 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
 
   // Easy case: VL has unique values and a "natural" size
   size_t NumUniqueScalarValues = UniqueValues.size();
-  bool IsFullVectors = hasFullVectorsOrPowerOf2(
-      TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
-  if (NumUniqueScalarValues == VL.size() &&
-      (VectorizeNonPowerOf2 || IsFullVectors)) {
+  if (NumUniqueScalarValues == VL.size()) {
     ReuseShuffleIndices.clear();
     return true;
   }
+  bool AreAllValuesNonConst = UniquePositions.size() == NumUniqueScalarValues;
+
+  // Check if we need to schedule the scalars. If not, we can keep the original
+  // scalars and avoid extra shuffles.
+  bool RequireScheduling = S && S.getOpcode() != Instruction::PHI &&
+                           !isVectorLikeInstWithConstOps(S.getMainOp()) &&
+                           (S.areInstructionsWithCopyableElements() ||
+                            !doesNotNeedToSchedule(UniqueValues));
+  // Drop tail poisons, if the values can be vectorized.
+  if (RequireScheduling) {
+    const auto EndIt =
+        find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()),
+                    IsaPred<PoisonValue>);
+    assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison.");
+    UniqueValues.erase(EndIt.base(), UniqueValues.end());
+    NumUniqueScalarValues = UniqueValues.size();
+  }
+
+  // Checks if unique inserts + shuffle is more profitable than just inserts or
+  // vectorized values.
+  auto EstimatePackPlusShuffleVsInserts = [&]() {
+    // Single instruction/argument insert - no shuffle.
+    if (UniquePositions.size() == 1 &&
+        (NumUniqueScalarValues == 1 ||
+         all_of(UniqueValues, IsaPred<UndefValue, Instruction, Argument>)))
+      return std::make_pair(false, false);
+    // Check if the given list of loads can be effectively vectorized.
+    auto CheckLoads = [&](ArrayRef<Value *> VL, bool IncludeGather) {
+      assert(S && S.getOpcode() == Instruction::Load && "Expected load.");
+      BoUpSLP::OrdersType Order;
+      SmallVector<Value *> PointerOps;
+      // Modified loads are gathered - use the original loads, result is the
+      // same, but cheaper, no shuffle.
+      BoUpSLP::LoadsState Res =
+          R.canVectorizeLoads(VL, S.getMainOp(), Order, PointerOps);
+      return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) ||
+             Res == BoUpSLP::LoadsState::ScatterVectorize;
+    };
+    // If the scalars are the operands of the root node - try to vectorize them
+    // with shuffles, otherwise we end up with the gather node, which may be
+    // non-profitable/small-tree for the vectorization.
+    if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 &&
+        !BuildGatherOnly) {
+      if (S && S.getOpcode() == Instruction::Load) {
+        // Modified loads are gathered - use the original loads, result is the
+        // same, but cheaper, no shuffle.
+        return std::make_pair(
+            true, CheckLoads(UniqueValues, /*IncludeGather=*/true) &&
+                      CheckLoads(VL, /*IncludeGather=*/false));
+      }
+      return std::make_pair(true, !RequireScheduling);
+    }
+    // Mark unique scalars, to be gathered/buildvectorized.
+    APInt DemandedElts = APInt::getZero(VL.size());
+    for_each(enumerate(ReuseShuffleIndices), [&](const auto &P) {
+      // Do not include constants.
+      if (P.value() != PoisonMaskElem &&
+          UniquePositions.contains(UniqueValues[P.value()]))
+        DemandedElts.setBit(P.index());
+    });
+    Type *ScalarTy = UniqueValues.front()->getType();
+    auto *VecTy = getWidenedType(ScalarTy, VL.size());
+    auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
+    // No need to schedule scalars and only single register used? Use original
+    // scalars, do not pack.
+    if (!RequireScheduling) {
+      const unsigned NumParts = ::getNumberOfParts(TTI, VecTy);
+      if (VL.size() / NumUniqueScalarValues == 1 &&
+          (NumParts <= 1 || ::getNumberOfParts(TTI, UniquesVecTy) >= NumParts))
+        return std::make_pair(true, true);
+    }
+    // Check if unique loads are more profitable than repeated loads.
+    if (S && S.getOpcode() == Instruction::Load) {
+      bool UniquesVectorized =
+          !CheckLoads(UniqueValues, /*IncludeGather=*/true);
+      if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=*/false)) {
+        // Modified loads are gathered - use the original loads, result is the
+        // same, but cheaper, no shuffle.
+        return std::make_pair(true, !UniquesVectorized);
+      }
+    }
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    // Insert to poison, if there are no constants in the scalars.
+    // If we also attempt to build a vector node, we can skip the buildvector
+    // cost, because it will be the same for both unique and non-unique values.
+    InstructionCost InsertsCost =
+        ((!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL))
+            ? InstructionCost(TTI::TCC_Free)
+            : ::getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                         /*Insert=*/true, /*Extract=*/false,
+                                         CostKind, AreAllValuesNonConst, VL);
+    APInt UniquesDemandedElts = APInt::getAllOnes(NumUniqueScalarValues);
+    for_each(seq<unsigned>(NumUniqueScalarValues), [&](unsigned Idx) {
+      // Do not include constants.
+      if (isConstant(UniqueValues[Idx]))
+        DemandedElts.clearBit(Idx);
+    });
+    InstructionCost UniquesCost =
+        (!BuildGatherOnly || R.hasSameNode(S, UniqueValues))
+            ? InstructionCost(TTI::TCC_Free)
+            : ::getScalarizationOverhead(TTI, ScalarTy, UniquesVecTy,
+                                         UniquesDemandedElts, /*Insert=*/true,
+                                         /*Extract=*/false, CostKind,
+                                         AreAllValuesNonConst, UniqueValues);
+    UniquesCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy,
+                                    ReuseShuffleIndices, CostKind, /*Index=*/0,
+                                    UniquesVecTy);
+    // If unique values are cheaper, use them.
+    if (UniquesCost <= InsertsCost)
+      return std::make_pair(true, false);
+    // If the cost difference is small, but the reduction cost may give some
+    // perf gain, prefer unique values, if we can fully vectorize it.
+    if (UniquesCost - InsertsCost < TTI::TCC_Expensive ||
+        (R.getTreeSize() == 0 && R.isReductionTree() &&
+         UniquesCost - InsertsCost == TTI::TCC_Expensive))
+      return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly),
+                            false);
+    // Otherwise, use the original values and still try to vectorize them, but
+    // only if they need no scheduling and the node is not gather-only.
+    return std::make_pair(!BuildGatherOnly && !RequireScheduling,
+                          !BuildGatherOnly && !RequireScheduling);
+  };
 
-  // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
-  if ((UserTreeIdx.UserTE &&
-       UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
-      !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) {
-    LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
-                         "for nodes with padding.\n");
-    ReuseShuffleIndices.clear();
-    return false;
-  }
+  const auto [DoPack, UseOriginal] = EstimatePackPlusShuffleVsInserts();
 
-  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-  if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
-      (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
-         return isa<UndefValue>(V) || !isConstant(V);
-       }))) {
-    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
-        S.getMainOp()->isSafeToRemove() &&
-        all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
-      // Find the number of elements, which forms full vectors.
-      unsigned PWSz = getFullVectorNumberOfElements(
-          TTI, UniqueValues.front()->getType(), UniqueValues.size());
-      PWSz = std::min<unsigned>(PWSz, VL.size());
-      if (PWSz == VL.size()) {
-        // We ended up with the same size after removing duplicates and
-        // upgrading the resulting vector size to a "nice size". Just keep
-        // the initial VL then.
-        ReuseShuffleIndices.clear();
-      } else {
-        // Pad unique values with poison to grow the vector to a "nice" size
-        SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
-                                                UniqueValues.end());
-        PaddedUniqueValues.append(
-            PWSz - UniqueValues.size(),
-            PoisonValue::get(UniqueValues.front()->getType()));
-        // Check that extended with poisons operations are still valid for
-        // vectorization (div/rem are not allowed).
-        if (!S.areInstructionsWithCopyableElements() &&
-            !getSameOpcode(PaddedUniqueValues, TLI).valid()) {
-          LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-          ReuseShuffleIndices.clear();
-          return false;
-        }
-        VL = std::move(PaddedUniqueValues);
-      }
-      return true;
+  if (DoPack) {
+    if (UseOriginal) {
+      // Prefer original scalars - avoid shuffling.
+      ReuseShuffleIndices.clear();
+    } else {
+      // Better to use uniques + reshuffle.
+      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+      VL = std::move(UniqueValues);
     }
-    LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-    ReuseShuffleIndices.clear();
-    return false;
+    return true;
   }
-  VL = std::move(UniqueValues);
-  return true;
+
+  // Buildvector/gather of the original scalars.
+  LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+  ReuseShuffleIndices.clear();
+  return false;
 }
 
 bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
@@ -9847,9 +9912,7 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
   SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
   SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
   if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
-      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
-      !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) ||
-      !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size()))
+      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
     return false;
   // Enable split node, only if all nodes are power-of-2/full registers.
   unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
@@ -9864,6 +9927,12 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
   }
   if (isIdentityOrder(ReorderIndices))
     ReorderIndices.clear();
+  else if (hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), VL.size()) &&
+           (!hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(),
+                                      Op1.size()) ||
+            !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(),
+                                      Op2.size())))
+    return false;
   SmallVector<int> Mask;
   if (!ReorderIndices.empty())
     inversePermutation(ReorderIndices, Mask);
@@ -10335,8 +10404,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
   if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
     LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
     // Do not try to pack to avoid extra instructions here.
-    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
-                                        /*TryToFindDuplicates=*/false);
+    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
   }
 
   // Check if this is a duplicate of another entry.
@@ -10385,8 +10453,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
   if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
     // Do not try to pack to avoid extra instructions here.
-    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
-                                        /*TryToFindDuplicates=*/false);
+    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
   }
 
   // If all of the operands are identical or constant we have a simple solution.
@@ -10480,7 +10547,6 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
                  interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
                  dbgs() << "]\n");
       return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
-                                          /*TryToFindDuplicates=*/true,
                                           /*TrySplitVectorize=*/true);
     }
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
@@ -10497,8 +10563,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
         LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                           << ") is ephemeral.\n");
         // Do not try to pack to avoid extra instructions here.
-        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
-                                            /*TryToFindDuplicates=*/false);
+        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
       }
     }
   }
@@ -10666,9 +10731,8 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
       Legality = getScalarsVectorizationLegality(
           VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
     if (!Legality.isLegal()) {
-      if (Legality.tryToFindDuplicates())
-        tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
-                            UserTreeIdx);
+      (void)tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S,
+                                UserTreeIdx, *this);
 
       newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
       return;
@@ -10682,7 +10746,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
 
   // Check that every instruction appears once in this bundle.
   if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx,
-                           /*TryPad=*/true)) {
+                           *this, /*BuildGatherOnly=*/false)) {
     newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
     return;
   }
@@ -12111,8 +12175,11 @@ void BoUpSLP::transformNodes() {
           continue;
         SmallVector<std::pair<unsigned, unsigned>> Slices;
         bool AllStrided = true;
-        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+        for (unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) {
+          const unsigned SliceVF = std::min(VF, End - Cnt);
+          if (SliceVF <= 1)
+            continue;
+          ArrayRef<Value *> Slice = VL.slice(Cnt, SliceVF);
           // If any instruction is vectorized already - do not try again.
           // Reuse the existing node, if it fully matches the slice.
           if (isVectorized(Slice.front()) &&
@@ -12132,16 +12199,14 @@ void BoUpSLP::transformNodes() {
           }
           if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
               count(Slice, Slice.front()) ==
-                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
+                  static_cast<long>(isa<UndefValue>(Slice.front()) ? SliceVF - 1
                                                                    : 1)) {
             if (IsSplat)
               continue;
             InstructionsState S = getSameOpcode(Slice, *TLI);
             if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
                 (S.getOpcode() == Instruction::Load &&
-                 areKnownNonVectorizableLoads(Slice)) ||
-                (S.getOpcode() != Instruction::Load &&
-                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
+                 areKnownNonVectorizableLoads(Slice)))
               continue;
             if (VF == 2) {
               // Try to vectorize reduced values or if all users are vectorized.
@@ -15080,7 +15145,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   for (ExternalUser &EU : ExternalUses) {
     LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
                       << EU.E.Idx << " in lane " << EU.Lane << "\n");
-    LLVM_DEBUG(dbgs() << "  User:" << *EU.User << "\n");
+    LLVM_DEBUG(if (EU.User) dbgs() << "  User:" << *EU.User << "\n";
+               else dbgs() << "  User: nullptr\n");
     LLVM_DEBUG(dbgs() << "  Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
 
     // Uses by ephemeral values are free (because the ephemeral value will be
@@ -15689,6 +15755,8 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
         tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
     ShufflesRes[Part] = Res;
     copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
+    if (SubVL.size() != SliceSize)
+      break;
   }
   if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
         return Res.has_value();
@@ -15700,7 +15768,8 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
 std::optional<TargetTransformInfo::ShuffleKind>
 BoUpSLP::isGatherShuffledSingleRegisterEntry(
     const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
-    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
+    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder,
+    unsigned SliceSize) {
   Entries.clear();
   // TODO: currently checking only for Scalars in the tree entry, need to count
   // reused elements too for better cost estimation.
@@ -15997,8 +16066,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           (*It)->isSame(TE->Scalars)))) {
       Entries.push_back(*It);
       if ((*It)->getVectorFactor() == VL.size()) {
-        std::iota(std::next(Mask.begin(), Part * VL.size()),
-                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
+        std::iota(std::next(Mask.begin(), Part * SliceSize),
+                  std::next(Mask.begin(), Part * SliceSize + VL.size()), 0);
       } else {
         SmallVector<int> CommonMask = TE->getCommonMask();
         copy(CommonMask, Mask.begin());
@@ -16006,7 +16075,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       // Clear undef scalars.
       for (unsigned I : seq<unsigned>(VL.size()))
         if (isa<PoisonValue>(VL[I]))
-          Mask[Part * VL.size() + I] = PoisonMaskElem;
+          Mask[Part * SliceSize + I] = PoisonMaskElem;
       return TargetTransformInfo::SK_PermuteSingleSrc;
     }
     // No perfect match, just shuffle, so choose the first tree node from the
@@ -16158,8 +16227,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   Entries.swap(TempEntries);
   if (EntryLanes.size() == Entries.size() &&
       !VL.equals(ArrayRef(TE->Scalars)
-                     .slice(Part * VL.size(),
-                            std::min<int>(VL.size(), TE->Scalars.size())))) {
+                     .slice(Part * SliceSize, getNumElems(TE->Scalars.size(),
+                                                          SliceSize, Part)))) {
     // We may have here 1 or 2 entries only. If the number of scalars is equal
     // to the number of entries, no need to do the analysis, it is not very
     // profitable. Since VL is not the same as TE->Scalars, it means we already
@@ -16172,7 +16241,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // Pair.first is the offset to the vector, while Pair.second is the index of
   // scalar in the list.
   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
-    unsigned Idx = Part * VL.size() + Pair.second;
+    unsigned Idx = Part * SliceSize + Pair.second;
     Mask[Idx] =
         Pair.first * VF +
         (ForOrder ? std::distance(
@@ -16197,8 +16266,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   } else if (!isa<VectorType>(VL.front()->getType()) &&
              (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
     // Do the cost estimation if shuffle beneficial than buildvector.
-    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
-                             std::next(Mask.begin(), (Part + 1) * VL.size()));
+    SmallVector<int> SubMask(
+        std::next(Mask.begin(), Part * SliceSize),
+        std::next(Mask.begin(), Part * SliceSize + VL.size()));
     int MinElement = SubMask.front(), MaxElement = SubMask.front();
     for (int Idx : SubMask) {
       if (Idx == PoisonMaskElem)
@@ -16211,16 +16281,23 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     assert(MaxElement >= 0 && MinElement >= 0 &&
            MaxElement % VF >= MinElement % VF &&
            "Expected at least single element.");
-    unsigned NewVF = std::max<unsigned>(
-        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
-                                                 (MaxElement % VF) -
-                                                     (MinElement % VF) + 1));
+    unsigned Offset = 0;
+    unsigned MinIdx = MinElement % VF;
+    if (MinIdx > 1) {
+      unsigned MinVF = getFloorFullVectorNumberOfElements(
+          *TTI, VL.front()->getType(), MinIdx);
+      auto *VecTy = getWidenedType(VL.front()->getType(), MinVF);
+      unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, MinVF);
+      if (NumParts > 1)
+        Offset = MinVF;
+    }
+    unsigned NewVF =
+        std::max<unsigned>(VL.size(), (MaxElement % VF) - Offset + 1);
     if (NewVF < VF) {
       for (int &Idx : SubMask) {
         if (Idx == PoisonMaskElem)
           continue;
-        Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
-              (Idx >= static_cast<int>(VF) ? NewVF : 0);
+        Idx = (Idx % VF) - Offset + (Idx >= static_cast<int>(VF) ? NewVF : 0);
       }
     } else {
       NewVF = VF;
@@ -16300,8 +16377,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
         /*Extract=*/false, CostKind);
     const TreeEntry *BestEntry = nullptr;
     if (FirstShuffleCost < ShuffleCost) {
-      std::for_each(std::next(Mask.begin(), Part * VL.size()),
-                    std::next(Mask.begin(), (Part + 1) * VL.size()),
+      std::for_each(std::next(Mask.begin(), Part * SliceSize),
+                    std::next(Mask.begin(), Part * SliceSize + VL.size()),
                     [&](int &Idx) {
                       if (Idx >= static_cast<int>(VF))
                         Idx = PoisonMaskElem;
@@ -16310,8 +16387,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       ShuffleCost = FirstShuffleCost;
     }
     if (SecondShuffleCost < ShuffleCost) {
-      std::for_each(std::next(Mask.begin(), Part * VL.size()),
-                    std::next(Mask.begin(), (Part + 1) * VL.size()),
+      std::for_each(std::next(Mask.begin(), Part * SliceSize),
+                    std::next(Mask.begin(), Part * SliceSize + VL.size()),
                     [&](int &Idx) {
                       if (Idx < static_cast<int>(VF))
                         Idx = PoisonMaskElem;
@@ -16332,8 +16409,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   }
   Entries.clear();
   // Clear the corresponding mask elements.
-  std::fill(std::next(Mask.begin(), Part * VL.size()),
-            std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
+  std::fill(std::next(Mask.begin(), Part * SliceSize),
+            std::next(Mask.begin(), Part * SliceSize + VL.size()),
+            PoisonMaskElem);
   return std::nullopt;
 }
 
@@ -16353,15 +16431,11 @@ BoUpSLP::isGatherShuffledEntry(
                  return !TE->isGather();
                })))
     return {};
-  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
-  // implemented yet.
-  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
-    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
          "Expected only single user of the gather node.");
-  assert(VL.size() % NumParts == 0 &&
-         "Number of scalars must be divisible by NumParts.");
+  unsigned PWSz =
+      getFullVectorNumberOfElements(*TTI, VL.front()->getType(), VL.size());
   if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
       TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
       (TE->Idx == 0 ||
@@ -16370,15 +16444,17 @@ BoUpSLP::isGatherShuffledEntry(
        (TE->hasState() &&
         getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars))))
     return {};
-  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+  unsigned SliceSize = getPartNumElems(PWSz, NumParts);
   SmallVector<std::optional<TTI::ShuffleKind>> Res;
   for (unsigned Part : seq<unsigned>(NumParts)) {
+    if (Part * SliceSize >= VL.size())
+      break;
     ArrayRef<Value *> SubVL =
         VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
     SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
     std::optional<TTI::ShuffleKind> SubRes =
         isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
-                                            ForOrder);
+                                            ForOrder, SliceSize);
     if (!SubRes)
       SubEntries.clear();
     Res.push_back(SubRes);
@@ -19713,6 +19789,20 @@ Value *BoUpSLP::vectorizeTree(
         SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
     }
   }
+  for (auto &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (VectorizableTree.front()->VectorizedValue != Entry->VectorizedValue &&
+        Entry->VectorizedValue &&
+        !Entry->VectorizedValue->getType()->isVoidTy() &&
+        (Entry->Idx != 0 || !Entry->hasState() ||
+         (Entry->getOpcode() != Instruction::InsertElement &&
+          Entry->getOpcode() != Instruction::PHI)) &&
+        Entry->VectorizedValue->use_empty()) {
+      if (auto *I = dyn_cast<Instruction>(Entry->VectorizedValue))
+        RemovedInsts.push_back(I);
+    }
+  }
+
   // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
   // cache correctness.
   // NOTE: removeInstructionAndOperands only marks the instruction for deletion
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
index 9c615bb4757fa..a1d3f250b8a83 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
@@ -7,18 +7,20 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @foo(ptr %0) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  vector.scevcheck:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> splat (i64 4)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 4
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr null, i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[SCEVGEP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x ptr> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP6]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP12]], <8 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <8 x ptr> [[TMP8]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP10]])
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[SCEVGEP3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <4 x i1> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_OP]])
 ; CHECK-NEXT:    br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
index 6bea2554c6f96..28d3825c181d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
@@ -3,9 +3,9 @@
 
 define void @f1(<2 x i16> %x, ptr %a) {
 ; CHECK-LABEL: @f1(
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0
 ; CHECK-NEXT:    store i16 [[TMP1]], ptr [[A:%.*]], align 2
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], ptr undef, align 2
 ; CHECK-NEXT:    ret void
 ;
@@ -29,9 +29,9 @@ define void @f2(<2 x i16> %x, ptr %a) {
 ; CHECK:       cont:
 ; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
 ; CHECK-NEXT:    [[AA:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0
 ; CHECK-NEXT:    store i16 [[TMP0]], ptr [[A]], align 2
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
 ; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], ptr undef, align 2
 ; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
@@ -70,9 +70,9 @@ define void @f3(<2 x i16> %x, ptr %a) {
 ; CHECK:       cont:
 ; CHECK-NEXT:    [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
 ; CHECK-NEXT:    [[AA:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1
 ; CHECK-NEXT:    store i16 [[TMP0]], ptr [[A]], align 2
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    store <4 x i16> [[SHUFFLE]], ptr undef, align 2
 ; CHECK-NEXT:    [[A_VAL:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
index 8d44d03e0e5cc..b83dda35abfdb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -10,9 +10,11 @@ define i32 @test(ptr %c) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
 ; CHECK-NEXT:    store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
 ; CHECK-NEXT:    ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 295a71899c338..d472a53503c17 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -37,29 +37,31 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
 ; CHECK-NEXT:    br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
 ; CHECK:       [[BB77]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x float> poison, float [[I70]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x float> [[TMP12]], float [[I69]], i32 9
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x float> [[TMP22]], float [[I67]], i32 13
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 0, i32 13, i32 9, i32 9>
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x float> poison, float [[I66]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[I70]], i32 2
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x float> [[TMP17]], float [[I68]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x float> [[TMP23]], float [[I69]], i32 4
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[I67]], i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> poison, <16 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 1, i32 poison, i32 3, i32 1, i32 3, i32 6, i32 3, i32 1, i32 poison, i32 4, i32 6, i32 6>
 ; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[BB78:.*]]
 ; CHECK:       [[BB78]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP38:%.*]] = phi <16 x float> [ [[TMP15]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = phi <16 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison
 ; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
-; CHECK-NEXT:    [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <16 x i32> <i32 12, i32 7, i32 5, i32 6, i32 15, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP37]] = shufflevector <16 x float> [[TMP31]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <6 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP36]] = shufflevector <6 x float> [[TMP40]], <6 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
 ; CHECK-NEXT:    br i1 poison, label %[[BB78]], label %[[BB167]]
 ; CHECK:       [[BB167]]:
 ; CHECK-NEXT:    [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 0f47c6b3ac902..d0e0b8b3657b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -124,16 +124,16 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <3 x i32> <i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <3 x i32> <i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <3 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> [[TMP4]], <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <3 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <3 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <3 x i32> [[TMP8]], <3 x i32> [[TMP11]], <4 x i32> <i32 0, i32 0, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]]
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 370ed1f258aca..68a7ac119c056 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -124,16 +124,16 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <3 x i32> <i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <3 x i32> <i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <3 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> [[TMP4]], <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <3 x i32> <i32 1, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <3 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <3 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <3 x i32> [[TMP8]], <3 x i32> [[TMP11]], <4 x i32> <i32 0, i32 0, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]]
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index 92027d0043f76..3be08790bc2e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,12 +17,15 @@
 
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 1
+; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 4
+; CHECK-NEXT:    [[LD1:%.*]] = load float, ptr [[GEP1]], align 4
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> <i32 poison, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP3]], float [[LD1]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index cbf8bc9dcf8f8..db411f2050ead 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -245,26 +245,17 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-NEXT:    [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1
 ; CHECK-NEXT:    [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
-; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11
-; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
-; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1)
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP13]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load <7 x i8>, ptr [[GEP_9]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <7 x i8> [[TMP3]], <7 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP2]], <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt <15 x i8> [[TMP7]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <15 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <15 x i8> [[TMP9]], <15 x i8> poison, <15 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <15 x i1> [[TMP8]], <15 x i8> [[TMP7]], <15 x i8> [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <15 x i8> [[TMP11]], <15 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
 ; CHECK-NEXT:    store <16 x i8> [[TMP14]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
index 82c940353ba5a..72a349fe0c585 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll
@@ -5,10 +5,9 @@ define i16 @test(ptr %i) {
 ; CHECK-LABEL: define i16 @test(
 ; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> <i64 132860, i64 137774>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP1]], <4 x i64> <i64 132860, i64 137774, i64 132860, i64 137774>
 ; CHECK-NEXT:    [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688
 ; CHECK-NEXT:    br label %[[FOR_COND5_US:.*]]
 ; CHECK:       [[FOR_COND5_US]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..db1370efb8133 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -14,85 +14,38 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; CHECK-NEXT:    [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
 ; CHECK-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312:%.*]] = extractelement <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], i64 0
 ; CHECK-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; CHECK-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; CHECK-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; CHECK-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
-; CHECK-NEXT:    [[ADD2157_I:%.*]] = add i32 [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312]], 1
-; CHECK-NEXT:    [[SHR2158_I:%.*]] = lshr i32 [[ADD2157_I]], 1
-; CHECK-NEXT:    [[CONV2159_I:%.*]] = trunc i32 [[SHR2158_I]] to i16
-; CHECK-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
-; CHECK-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; CHECK-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
-; CHECK-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
-; CHECK-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; CHECK-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; CHECK-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; CHECK-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; CHECK-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
 ; CHECK-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
-; CHECK-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
-; CHECK-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; CHECK-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; CHECK-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; CHECK-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; CHECK-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; CHECK-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; CHECK-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; CHECK-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
-; CHECK-NEXT:    [[ADD2280_I:%.*]] = add i32 [[ADD111_I_I]], 1
-; CHECK-NEXT:    [[SHR2281_I:%.*]] = lshr i32 [[ADD2280_I]], 1
-; CHECK-NEXT:    [[CONV2282_I:%.*]] = trunc i32 [[SHR2281_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
-; CHECK-NEXT:    store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; CHECK-NEXT:    store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; CHECK-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; CHECK-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; CHECK-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; CHECK-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
 ; CHECK-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
-; CHECK-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; CHECK-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; CHECK-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; CHECK-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; CHECK-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; CHECK-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
-; CHECK-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
-; CHECK-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
-; CHECK-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; CHECK-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
-; CHECK-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
-; CHECK-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; CHECK-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; CHECK-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; CHECK-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; CHECK-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; CHECK-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; CHECK-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
-; CHECK-NEXT:    store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8230), align 2
-; CHECK-NEXT:    store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8210), align 2
-; CHECK-NEXT:    store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8190), align 2
-; CHECK-NEXT:    store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
-; CHECK-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
-; CHECK-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; CHECK-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
-; CHECK-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; CHECK-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; CHECK-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[ADD2323_I]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <3 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <3 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 poison, i32 poison, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x i32> [[TMP4]], <3 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 poison, i32 10, i32 3, i32 poison, i32 poison, i32 14, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP0]], i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[ADD2235_I16]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[MUL1445_I]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[SHR143_5_I_I9]], i32 7
+; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <11 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <3 x i32> [[TMP7]], <3 x i32> poison, <11 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <11 x i32> [[TMP16]], <11 x i32> [[TMP17]], <11 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr <11 x i32> [[TMP18]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 4, i32 7>
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD1392_I]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], splat (i32 1)
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP24:%.*]] = add <4 x i32> [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <11 x i32> [[TMP19]], <11 x i32> poison, <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <15 x i32> [[TMP25]], <15 x i32> [[TMP26]], <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 15, i32 16, i32 17, i32 18>
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc <15 x i32> [[TMP27]] to <15 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <15 x i16> [[TMP28]], <15 x i16> poison, <32 x i32> <i32 0, i32 8, i32 9, i32 1, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 0, i32 8, i32 9, i32 1, i32 11, i32 10, i32 12, i32 4, i32 5, i32 6, i32 0, i32 8, i32 9, i32 7, i32 13, i32 10, i32 12, i32 4, i32 5, i32 6, i32 0, i32 14>
+; CHECK-NEXT:    store <32 x i16> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
 ; CHECK-NEXT:    ret i32 0
 ;
 ; THRESH-LABEL: define fastcc i32 @test(
@@ -102,80 +55,37 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
-; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
-; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
-; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
-; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
-; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
-; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
-; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
 ; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
-; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
-; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
-; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
-; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
-; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
-; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
-; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
-; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
-; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[ADD2323_I]], i32 2
+; THRESH-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP6:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <3 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP7:%.*]] = or <3 x i32> [[TMP4]], [[TMP6]]
+; THRESH-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 poison, i32 poison, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP9:%.*]] = shufflevector <3 x i32> [[TMP4]], <3 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 0, i32 poison>
+; THRESH-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 poison, i32 10, i32 3, i32 poison, i32 poison, i32 14, i32 poison>
+; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP0]], i32 4
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[ADD2235_I16]], i32 1
+; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[MUL1445_I]], i32 5
+; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[SHR143_5_I_I9]], i32 7
+; THRESH-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 1, i32 1>
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <11 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP17:%.*]] = shufflevector <3 x i32> [[TMP7]], <3 x i32> poison, <11 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP18:%.*]] = shufflevector <11 x i32> [[TMP16]], <11 x i32> [[TMP17]], <11 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 12, i32 13>
+; THRESH-NEXT:    [[TMP19:%.*]] = lshr <11 x i32> [[TMP18]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>
+; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> <i32 2, i32 poison, i32 4, i32 7>
+; THRESH-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD1392_I]], i32 1
+; THRESH-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], splat (i32 1)
+; THRESH-NEXT:    [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+; THRESH-NEXT:    [[TMP24:%.*]] = add <4 x i32> [[TMP22]], [[TMP23]]
+; THRESH-NEXT:    [[TMP25:%.*]] = shufflevector <11 x i32> [[TMP19]], <11 x i32> poison, <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP27:%.*]] = shufflevector <15 x i32> [[TMP25]], <15 x i32> [[TMP26]], <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 15, i32 16, i32 17, i32 18>
+; THRESH-NEXT:    [[TMP28:%.*]] = trunc <15 x i32> [[TMP27]] to <15 x i16>
+; THRESH-NEXT:    [[TMP29:%.*]] = shufflevector <15 x i16> [[TMP28]], <15 x i16> poison, <32 x i32> <i32 0, i32 8, i32 9, i32 1, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 0, i32 8, i32 9, i32 1, i32 11, i32 10, i32 12, i32 4, i32 5, i32 6, i32 0, i32 8, i32 9, i32 7, i32 13, i32 10, i32 12, i32 4, i32 5, i32 6, i32 0, i32 14>
+; THRESH-NEXT:    store <32 x i16> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll
index 8db3a8b6ff219..8b409851d1eb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll
@@ -5,9 +5,9 @@ define <4 x i32> @test(i16 %0, i16 %1) {
 ; CHECK-LABEL: define <4 x i32> @test(
 ; CHECK-SAME: i16 [[TMP0:%.*]], i16 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> <i16 poison, i16 0>, i16 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> <i16 poison, i16 poison, i16 poison, i16 0>, i16 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    [[CONV15_I:%.*]] = sext i16 [[TMP0]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], splat (i32 -1)
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[CONV15_I]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
index 510cf45edbb52..01a279890ea24 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
@@ -8,32 +8,30 @@ define void @test(ptr %mdct_forward_x) {
 ; CHECK-NEXT:    br label %[[FOR_COND:.*]]
 ; CHECK:       [[FOR_COND]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
-; CHECK-NEXT:    [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
-; CHECK-NEXT:    [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28>
-; CHECK-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
+; CHECK-NEXT:    [[ARRAYIDX10_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 28
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <3 x ptr> poison, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x ptr> [[TMP1]], <3 x ptr> poison, <3 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <3 x ptr> [[TMP2]], <3 x i64> <i64 32, i64 36, i64 48>
+; CHECK-NEXT:    [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> <float poison, float poison, float 0.000000e+00, float poison>, <4 x float> [[TMP22]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = fsub <4 x float> zeroinitializer, [[TMP8]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <4 x float> zeroinitializer, [[TMP8]]
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX10_I_I]], i64 -4, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP20]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <3 x float> <float 0.000000e+00, float poison, float 0.000000e+00>, <3 x float> [[TMP6]], <3 x i32> <i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub <3 x float> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <3 x float> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <3 x float> [[TMP9]], <3 x float> [[TMP10]], <3 x i32> <i32 0, i32 1, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = call <3 x float> @llvm.masked.gather.v3f32.v3p0(<3 x ptr> [[TMP3]], i32 4, <3 x i1> splat (i1 true), <3 x float> poison)
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> <i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <3 x float> <float poison, float 0.000000e+00, float 0.000000e+00>, <3 x float> [[TMP6]], <3 x i32> <i32 5, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP15:%.*]] = fsub <3 x float> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = fadd <3 x float> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <3 x float> [[TMP15]], <3 x float> [[TMP16]], <3 x i32> <i32 0, i32 1, i32 5>
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[ADD_PTR_I]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = fsub <4 x float> [[TMP15]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <4 x float> [[TMP15]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = fsub <3 x float> [[TMP17]], [[TMP11]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fadd <3 x float> [[TMP17]], [[TMP11]]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <3 x float> [[TMP18]], <3 x float> [[TMP19]], <4 x i32> <i32 0, i32 4, i32 2, i32 0>
 ; CHECK-NEXT:    store <4 x float> [[TMP21]], ptr [[ARRAYIDX2_I_I]], align 4
 ; CHECK-NEXT:    br label %[[FOR_COND]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll
index dbeff25954085..64de6033a4875 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll
@@ -4,23 +4,23 @@
 define void @test(i32 %0, i64 %1, i32 %2, i32 %3, ptr %4) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], ptr [[TMP4:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP56]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP73:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP73]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP98:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <60 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[TMP0]], i32 7
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <60 x i32> [[TMP11]], <60 x i32> poison, <60 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <60 x i32> poison, i32 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <60 x i32> [[TMP13]], i32 [[TMP2]], i32 7
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP98]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP73]], i32 6
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP98]], i32 6
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <60 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 60, i32 61, i32 poison, i32 poison, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> poison, <60 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <60 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 60, i32 61, i32 poison, i32 poison, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP22]], <8 x i32> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> poison, <60 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:    [[TMP21:%.*]] = xor <60 x i32> [[TMP12]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP130:%.*]] = call i32 @llvm.vector.reduce.or.v60i32(<60 x i32> [[TMP21]])
 ; CHECK-NEXT:    store i32 [[TMP130]], ptr [[TMP4]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 143e09374a891..c083e9e551441 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -8,15 +8,15 @@ define void @Test(i32) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <8 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
 ; CHECK-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> poison, i32 [[OP_RDX1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VAL_43]], i32 1
+; CHECK-NEXT:    [[TMP6]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
@@ -24,15 +24,15 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
 ; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = phi <8 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
 ; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
 ; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
 ; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> poison, i32 [[OP_RDX1]], i32 0
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP6]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 194c7021f60f5..a4c1d1bd98f8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -4,20 +4,20 @@
 define void @mainTest(i32 %param, ptr %vals, i32 %len) {
 ; CHECK-LABEL: @mainTest(
 ; CHECK-NEXT:  bci_15.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 31>, i32 [[PARAM:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> <i32 poison, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>, i32 [[PARAM:%.*]], i32 0
 ; CHECK-NEXT:    br label [[BCI_15:%.*]]
 ; CHECK:       bci_15:
 ; CHECK-NEXT:    [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <16 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]]
 ; CHECK-NEXT:    [[V44]] = add i32 [[LOCAL_4_]], 16
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> poison, i32 [[OP_RDX]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[V44]], i32 1
+; CHECK-NEXT:    [[TMP6]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
 ; CHECK:       loopexit:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
index e1ee35217d187..2ac19ffc564d3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll
@@ -21,13 +21,9 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) {
 ; SSE-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
 ; SSE-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
 ; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
-; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
-; SSE-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
-; SSE-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
-; SSE-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
-; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
-; SSE-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
-; SSE-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; SSE-NEXT:    [[TMP1:%.*]] = load <12 x i8>, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = icmp eq <12 x i8> [[TMP1]], zeroinitializer
+; SSE-NEXT:    [[TMP3:%.*]] = select <12 x i1> [[TMP2]], <12 x i64> zeroinitializer, <12 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096>
 ; SSE-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
 ; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
 ; SSE-NEXT:    [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
@@ -40,11 +36,7 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) {
 ; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
 ; SSE-NEXT:    [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
 ; SSE-NEXT:    [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]]
-; SSE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP12]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]])
+; SSE-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v12i64(<12 x i64> [[TMP3]])
 ; SSE-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]]
 ; SSE-NEXT:    [[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]]
 ; SSE-NEXT:    [[OP_RDX6:%.*]] = or i64 [[OP_RDX]], [[OP_RDX5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
index c4ddc5d63cc04..8a9ea6886bda1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
@@ -8,9 +8,7 @@ define i16 @test(i16 %v1, i16 %v2) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[V2]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[V1]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 poison, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP0]], <4 x i32> <i32 3, i32 3, i32 3, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i16> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i16> [[TMP9]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
index 7510b8fb83e34..83c0c028a2170 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -16,8 +16,9 @@ define i32 @test(ptr nocapture %G) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[G:%.*]], i64 5
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00, double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <3 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00, double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    store <4 x double> [[TMP3]], ptr [[G]], align 8
 ; CHECK-NEXT:    ret i32 undef
@@ -283,7 +284,7 @@ return:                                           ; preds = %entry, %if.end
 define void @PR19646(ptr %this, i1 %arg) {
 ; CHECK-LABEL: @PR19646(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[IF_END13:%.*]], label [[IF_END13]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_END13:%.*]], label [[IF_END13]]
 ; CHECK:       sw.epilog7:
 ; CHECK-NEXT:    [[DOTIN:%.*]] = getelementptr inbounds [[CLASS_B_53_55:%.*]], ptr [[THIS:%.*]], i64 0, i32 0, i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[DOTIN]], align 8
@@ -294,7 +295,7 @@ define void @PR19646(ptr %this, i1 %arg) {
 ; CHECK-NEXT:    [[_DY:%.*]] = getelementptr inbounds [[CLASS_B_53_55]], ptr [[THIS]], i64 0, i32 0, i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[_DY]], align 8
 ; CHECK-NEXT:    [[ADD10:%.*]] = fadd double [[ADD8]], [[TMP2]]
-; CHECK-NEXT:    br i1 %arg, label [[IF_THEN12:%.*]], label [[IF_END13]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[IF_THEN12:%.*]], label [[IF_END13]]
 ; CHECK:       if.then12:
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr undef, align 8
 ; CHECK-NEXT:    br label [[IF_END13]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll
index 07ee8f840721f..8df459a9a22b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll
@@ -11,20 +11,24 @@ define i8 @test(ptr %g_127, i32 %0, i16 %1) {
 ; CHECK:       [[FOR_INC434_I]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 60, %[[FOR_COND166_PREHEADER_I]] ]
 ; CHECK-NEXT:    [[CONV8_I_I:%.*]] = zext nneg i32 [[TMP0]] to i64
-; CHECK-NEXT:    [[DIV_I_I_1:%.*]] = udiv i64 [[CONV8_I_I]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[DIV_I_I_1]] to i16
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[CONV8_I_I]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = udiv <4 x i64> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i16>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 2, i32 poison, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 poison, i32 10, i32 11, i32 poison, i32 13, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 3, i32 4, i32 4, i32 6, i32 6>
-; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP13]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[TMP14]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[CONV8_I_I]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x i64> [[TMP3]], <3 x i64> poison, <3 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <3 x i64> [[TMP5]], <3 x i64> poison, <3 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv <3 x i64> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <3 x i64> [[TMP7]] to <3 x i16>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x i16> [[TMP8]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> [[TMP9]])
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[CONV8_I_I]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = udiv <2 x i64> [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc <2 x i64> [[TMP19]] to <2 x i16>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP18:%.*]] = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> [[TMP17]])
+; CHECK-NEXT:    [[RDX_OP:%.*]] = and <4 x i16> [[TMP10]], [[TMP18]]
+; CHECK-NEXT:    [[TMP15:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[RDX_OP]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i16 [[TMP15]], [[TMP1]]
 ; CHECK-NEXT:    [[AND14_I_2_I_5:%.*]] = zext i16 [[OP_RDX]] to i32
 ; CHECK-NEXT:    store i32 [[AND14_I_2_I_5]], ptr [[G_127]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
index 73b73735da021..75f38c5d0807c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll
@@ -11,14 +11,12 @@ define void @test(double %i) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[I75:%.*]] = fsub double 0.000000e+00, [[I]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> <i32 poison, i32 0, i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> <i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 5, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 poison, i32 12, i32 5, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <6 x i32> <i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP10]], <6 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison, double 0.000000e+00, double poison>, <6 x i32> <i32 6, i32 7, i32 2, i32 poison, i32 10, i32 5>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x double> [[TMP6]], double [[I75]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <6 x double> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x double> [[TMP9]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    br label [[BB116:%.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 6942df532ae29..9655cc3d89d3c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -9,12 +9,13 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[I82:%.*]] = fsub double 0.000000e+00, poison
 ; CHECK-NEXT:    [[I103:%.*]] = fsub double 0.000000e+00, [[I]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 1, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <8 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, [[TMP7]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <5 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <5 x double> [[TMP2]], <5 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00>, <5 x i32> <i32 5, i32 1, i32 poison, i32 3, i32 9>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <5 x double> [[TMP3]], double [[I82]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <5 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <5 x double> [[TMP5]], <5 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <6 x double> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP16:%.*]] = freeze <8 x i1> [[TMP15]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
index 7bbc694dc5181..51ee894894081 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll
@@ -10,7 +10,7 @@ define double @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, double [[TMP3]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    store double [[TMP7]], ptr null, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
index 18e03df0fbcc9..132865da252c9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll
@@ -7,14 +7,14 @@ define i32 @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP12]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 poison, i64 poison, i64 poison>, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 1, i32 2, i32 2, i32 3, i32 3, i32 3, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP3]], <6 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 0, i64 poison>, <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 10, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x i64> [[TMP4]], <6 x i64> [[TMP3]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <6 x i64> [[TMP5]] to <6 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <6 x i32> [[TMP6]], <6 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <6 x i32> <i32 1, i32 2, i32 3, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc <6 x i64> [[TMP10]] to <6 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <6 x i32> [[TMP11]], <6 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 2, i32 2, i32 5, i32 4, i32 3>
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
index 5e3d4715e99c5..7455fedf0762c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll
@@ -7,10 +7,10 @@ define i32 @test() {
 ; CHECK-NEXT:    br label %[[FUNC_135_EXIT_I:.*]]
 ; CHECK:       [[FUNC_135_EXIT_I]]:
 ; CHECK-NEXT:    [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison>, [[TMP1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <5 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, i32 [[G_228_PROMOTED166_I1105_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <5 x i32> [[TMP0]], <5 x i32> poison, <5 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <5 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <5 x i32> [[TMP2]], <5 x i32> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
index ce65f532e0b3b..e1868a26b4127 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll
@@ -6,42 +6,38 @@ define i32 @test(i64 %l.549) {
 ; CHECK-SAME: i64 [[L_549:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CONV3:%.*]] = sext i32 0 to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[L_549]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[IF_THEN19:.*]]
 ; CHECK:       [[P:.*]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP14:%.*]], %[[IF_END25:.*]] ]
 ; CHECK-NEXT:    br i1 false, label %[[S:.*]], label %[[Q:.*]]
 ; CHECK:       [[Q]]:
 ; CHECK-NEXT:    [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 3
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP18]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]]
 ; CHECK:       [[LOR_LHS_FALSE]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]]
 ; CHECK:       [[R]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i64> [ [[TMP19]], %[[Q]] ], [ [[TMP20:%.*]], %[[IF_THEN19]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i64> [ [[TMP19]], %[[Q]] ], [ [[TMP20:%.*]], %[[IF_THEN19]] ]
 ; CHECK-NEXT:    br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]]
 ; CHECK:       [[LAND_LHS_TRUE]]:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i64> [ [[TMP21]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x i64> [ [[TMP17]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ]
 ; CHECK-NEXT:    br i1 false, label %[[Q]], label %[[S]]
 ; CHECK:       [[S]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i64> [ [[TMP22]], %[[LAND_LHS_TRUE]] ], [ [[TMP21]], %[[R]] ], [ [[TMP19]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi <4 x i64> [ [[TMP21]], %[[LAND_LHS_TRUE]] ], [ [[TMP17]], %[[R]] ], [ [[TMP12]], %[[LOR_LHS_FALSE]] ], [ [[TMP3]], %[[P]] ]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label %[[IF_THEN19]]
 ; CHECK:       [[IF_THEN19]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ]
-; CHECK-NEXT:    [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ]
+; CHECK-NEXT:    [[TMP14]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP20]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br i1 false, label %[[R]], label %[[IF_END25]]
 ; CHECK:       [[IF_END25]]:
 ; CHECK-NEXT:    br i1 false, label %[[IF_END29]], label %[[P]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll
index 42ed26d82e036..e74b1f09114eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll
@@ -5,7 +5,8 @@ define i64 @test(ptr %p) {
 ; CHECK-LABEL: define i64 @test(
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 12
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <3 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP13]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
@@ -87,7 +88,8 @@ define i64 @test1(ptr %p) {
 ; CHECK-LABEL: define i64 @test1(
 ; CHECK-SAME: ptr [[P:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 12
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <3 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP13]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
index e42e6183b8cae..889ed393e3f45 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll
@@ -17,10 +17,14 @@ define ptr @test(ptr %0, ptr %args_gep) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <7 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <9 x i32> [[TMP11]], <9 x i32> [[TMP18]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <7 x i32> [[TMP10]], <7 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <9 x i32> [[TMP19]], <9 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP20]], <16 x i32> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24>
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq <16 x i32> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i1> [[TMP14]] to <16 x i8>
 ; CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[TMP5]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f0d2934..14fdd17753805 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,29 +3,18 @@
 
 define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
+; CHECK-NEXT:    [[TMP2:%.*]] = or i32 0, 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[TMP2]], i32 7
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <64 x i32> [[TMP8]], <64 x i32> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <64 x i32> [[TMP8]], <64 x i32> [[TMP15]], <23 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <23 x i32> [[TMP9]], <23 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq <64 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <64 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
 ; CHECK-NEXT:    [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
 ; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll
index 0fddb7322e9b3..989da443dd7fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll
@@ -11,21 +11,16 @@ define <16 x double> @test(ptr %x, double %v, double %a) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[GEP8]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x double> poison, double [[A]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x double> [[TMP4]], <16 x double> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[V]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x double> [[TMP12]], <16 x double> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <6 x double> poison, double [[V]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP9]], <6 x double> poison, <6 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP16]], <6 x double> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x double> [[TMP14]], <16 x double> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x double> [[TMP16]], <16 x double> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <16 x double> [[TMP21]], <16 x double> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x double> [[TMP19]], <16 x double> [[TMP20]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP16]], <6 x double> [[TMP10]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x double> [[TMP11]], <16 x double> [[TMP12]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 poison, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x double> [[TMP13]], <16 x double> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]]
 ; CHECK-NEXT:    ret <16 x double> [[TMP18]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 4f94784a24dd4..86d20eb20cf2f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s
 
 ;unsigned load_le32(unsigned char *data) {
 ;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
@@ -51,24 +51,10 @@ entry:
 }
 
 define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) {
-; AVX2-LABEL: @PR16739_byref(
-; AVX2-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
-; AVX2-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
-; AVX2-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
-; AVX2-NEXT:    ret <4 x float> [[I3]]
-;
-; AVX512-LABEL: @PR16739_byref(
-; AVX512-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1
-; AVX512-NEXT:    [[X0:%.*]] = load float, ptr [[X]], align 4
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; AVX512-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX512-NEXT:    [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[TMP3]]
+; CHECK-LABEL: @PR16739_byref(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1
   %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
@@ -103,10 +89,11 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
 ; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <3 x i32> <i32 0, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> [[TMP2]], i64 [[T8]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <3 x i64> [[TMP3]] to <3 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <3 x i32> [[TMP4]] to <3 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %t1 = load i64, ptr %x, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 700e3ed9effc4..7efd1b16e6862 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s
 
 ;unsigned load_le32(unsigned char *data) {
 ;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
@@ -51,24 +51,10 @@ entry:
 }
 
 define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) {
-; AVX2-LABEL: @PR16739_byref(
-; AVX2-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
-; AVX2-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
-; AVX2-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
-; AVX2-NEXT:    ret <4 x float> [[I3]]
-;
-; AVX512-LABEL: @PR16739_byref(
-; AVX512-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1
-; AVX512-NEXT:    [[X0:%.*]] = load float, ptr [[X]], align 4
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; AVX512-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX512-NEXT:    [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[TMP3]]
+; CHECK-LABEL: @PR16739_byref(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1
   %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
@@ -103,10 +89,11 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
 ; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X]], align 16
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <3 x i32> <i32 0, i32 poison, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> [[TMP2]], i64 [[T8]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <3 x i64> [[TMP3]] to <3 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <3 x i32> [[TMP4]] to <3 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
 ; CHECK-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %t1 = load i64, ptr %x, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
index f18a72b0bf776..10588da5ef663 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
@@ -7,33 +7,14 @@
 
 define <2 x i64> @load_00123456(ptr nocapture noundef readonly %data) {
 ; SSE-LABEL: @load_00123456(
-; SSE-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 1
-; SSE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 2
-; SSE-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3
-; SSE-NEXT:    [[T0:%.*]] = load i16, ptr [[DATA]], align 2
-; SSE-NEXT:    [[T1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
-; SSE-NEXT:    [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2
-; SSE-NEXT:    [[VECINIT0_I_I:%.*]] = insertelement <8 x i16> undef, i16 [[T0]], i64 0
-; SSE-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT0_I_I]], i16 [[T0]], i64 1
-; SSE-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[T1]], i64 2
-; SSE-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[T2]], i64 3
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[VECINIT7_I_I:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP1:%.*]] = load <7 x i16>, ptr [[DATA:%.*]], align 2
+; SSE-NEXT:    [[VECINIT7_I_I:%.*]] = shufflevector <7 x i16> [[TMP1]], <7 x i16> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
 ; SSE-NEXT:    [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I]] to <2 x i64>
 ; SSE-NEXT:    ret <2 x i64> [[T7]]
 ;
 ; AVX-LABEL: @load_00123456(
-; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 2
-; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[DATA]], align 2
-; AVX-NEXT:    [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINIT2_I_I2:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I2]], i16 [[T2]], i64 3
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINIT7_I_I1:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    [[TMP1:%.*]] = load <7 x i16>, ptr [[DATA:%.*]], align 2
+; AVX-NEXT:    [[VECINIT7_I_I1:%.*]] = shufflevector <7 x i16> [[TMP1]], <7 x i16> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
 ; AVX-NEXT:    [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I1]] to <2 x i64>
 ; AVX-NEXT:    ret <2 x i64> [[T7]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
index 842bd6c6bec37..9db0320835bf3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
@@ -103,9 +103,8 @@ define i64 @test_3() #0 {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i32> [ splat (i32 3), [[BB1]] ], [ poison, [[BB2:%.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <28 x i32> [ splat (i32 3), [[BB1]] ], [ poison, [[BB2:%.*]] ]
+; CHECK-NEXT:    [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <32 x i32> poison, i32 [[VAL4]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
index 04359eb6fcd7c..9f3a45b7a15b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll
@@ -16,10 +16,10 @@ define void @e(ptr %c, i64 %0) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <6 x ptr> [[TMP7]], ptr [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <6 x ptr> [[TMP8]], <6 x ptr> [[TMP19]], <6 x i32> <i32 6, i32 7, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x ptr> [[TMP6]], <2 x ptr> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <6 x ptr> [[TMP20]], <6 x ptr> [[TMP21]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <6 x ptr> [[TMP8]], <6 x ptr> [[TMP21]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <6 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <6 x ptr> [[TMP19]], <6 x ptr> [[TMP20]], <6 x i32> <i32 6, i32 7, i32 2, i32 3, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index db38a62017391..ad920013ad013 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -643,7 +643,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -653,7 +652,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index bfa3610804967..d78d68820d4d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -643,7 +643,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -653,7 +652,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
index 782aada17acac..e0d1f1b0ff019 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -23,10 +23,9 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 
 define <4 x i32> @bar(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
   %add = add nsw i32 %f, 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
index bada001ebbc6c..b8bc64d3bb728 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -5,9 +5,9 @@
 
 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> poison, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <3 x i32> [[TMP2]] to <3 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = extractelement <4 x i32> %0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21b9eedf..b8a3f9de76039 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -13,7 +13,7 @@ define i32 @test(i1 %cond) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[OR92]] = or i32 1, 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
index 7bb436b9543bf..55a22731aaf9f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -5,7 +5,6 @@ define void @test(ptr noalias %0, ptr %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 16, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 15, i32 4, i32 5, i32 0, i32 2, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 15, i32 4, i32 5, i32 15, i32 4, i32 5, i32 15, i32 0, i32 5, i32 2, i32 6, i32 7, i32 8, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 4, i32 24, i32 15, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index 63dbf3ce78c32..72602ee2a74ed 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -9,7 +9,6 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>, <6 x float> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 5, i32 4>
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> <i32 0, i32 4, i32 0, i32 4, i32 5, i32 1, i32 5, i32 1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
index 7d65fe1bcde76..401ab79a08b3a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
@@ -9,16 +9,14 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
 ; CHECK-NEXT:    [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
+; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 9, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <3 x double>, ptr [[ARRAYIDX_I5_I]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP3]], <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x double> [[TMP3]], <5 x double> [[TMP1]], <4 x i32> <i32 1, i32 6, i32 9, i32 2>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
index 61294089fd4cb..22734f56813bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll
@@ -5,10 +5,11 @@ define void @test(i32 %0, ptr %p) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 poison, i32 1, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0>
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = extractelement <8 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[PH:%.*]]
 ; CHECK:       ph:
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0>, i32 [[TMP0]], i32 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
index 92a1e289044d7..cea0b1fb126c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
@@ -4,52 +4,56 @@
 define <16 x half> @test(i32 %0, float %1, i32 %2) {
 ; CHECK-LABEL: define <16 x half> @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], float [[TMP1:%.*]], i32 [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00>, float [[TMP1]], i32 13
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <13 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00>, float [[TMP1]], i32 11
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <13 x float> [[TMP4]] to <13 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer)
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP9]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x i32> [[TMP5]], i32 10
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float 0.000000e+00 to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <13 x i32> [[TMP5]], i32 8
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float 0.000000e+00 to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ult i32 0, 0
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult i32 0, 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp ult i32 0, 0
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i32 0, 0
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i32> [[TMP5]], i32 4
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <13 x i32> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP22]], 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]]
-; CHECK-NEXT:    [[TMP25:%.*]] = bitcast float 0.000000e+00 to i32
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[TMP25]], 0
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp ult <16 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP29:%.*]] = sitofp <16 x i32> [[TMP28]] to <16 x float>
-; CHECK-NEXT:    [[TMP30:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP29]], <16 x float> zeroinitializer, <16 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd <16 x float> [[TMP30]], zeroinitializer
-; CHECK-NEXT:    [[TMP35:%.*]] = select <16 x i1> zeroinitializer, <16 x float> zeroinitializer, <16 x float> [[TMP31]]
-; CHECK-NEXT:    [[TMP36:%.*]] = bitcast <16 x float> [[TMP35]] to <16 x i32>
-; CHECK-NEXT:    [[TMP37:%.*]] = and <16 x i32> [[TMP36]], zeroinitializer
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <16 x float>
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float 0.000000e+00 to i32
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ult <13 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = select <13 x i1> [[TMP25]], <13 x i32> zeroinitializer, <13 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP27:%.*]] = sitofp <13 x i32> [[TMP26]] to <13 x float>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <13 x float> [[TMP27]], <13 x float> poison, <14 x i32> <i32 0, i32 1, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT:    [[TMP29:%.*]] = call <14 x float> @llvm.fmuladd.v14f32(<14 x float> [[TMP28]], <14 x float> zeroinitializer, <14 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP30:%.*]] = fadd <14 x float> [[TMP29]], zeroinitializer
+; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <14 x float> [[TMP30]], <14 x float> poison, <15 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 0, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <15 x i1> <i1 poison, i1 false, i1 poison, i1 poison, i1 false, i1 poison, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false>, i1 false, i32 0
+; CHECK-NEXT:    [[TMP50:%.*]] = insertelement <15 x i1> [[TMP49]], i1 false, i32 5
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <15 x i1> [[TMP50]], <15 x i1> poison, <15 x i32> <i32 0, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14>
+; CHECK-NEXT:    [[TMP56:%.*]] = select <15 x i1> [[TMP51]], <15 x float> zeroinitializer, <15 x float> [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = bitcast <15 x float> [[TMP56]] to <15 x i32>
+; CHECK-NEXT:    [[TMP58:%.*]] = and <15 x i32> [[TMP57]], zeroinitializer
+; CHECK-NEXT:    [[TMP59:%.*]] = bitcast <15 x i32> [[TMP58]] to <15 x float>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <15 x float> [[TMP59]], <15 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
 ; CHECK-NEXT:    [[TMP53:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <16 x float> [[TMP53]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
 ; CHECK-NEXT:    [[TMP40:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> zeroinitializer, <16 x float> [[TMP38]], <16 x float> [[TMP39]])
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x float> [[TMP29]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <16 x float> [[TMP29]], i32 14
-; CHECK-NEXT:    [[TMP44:%.*]] = fcmp ogt float [[TMP43]], 0.000000e+00
-; CHECK-NEXT:    [[TMP45:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <16 x float> [[TMP29]], i32 13
-; CHECK-NEXT:    [[TMP47:%.*]] = fcmp ogt float [[TMP46]], 0.000000e+00
-; CHECK-NEXT:    [[TMP48:%.*]] = fcmp olt float [[TMP46]], 0.000000e+00
-; CHECK-NEXT:    [[TMP49:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <16 x float> [[TMP29]], i32 1
-; CHECK-NEXT:    [[TMP51:%.*]] = fcmp ogt float [[TMP50]], 0.000000e+00
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <14 x float> [[TMP28]], i32 0
+; CHECK-NEXT:    [[TMP44:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <14 x float> [[TMP28]], i32 13
+; CHECK-NEXT:    [[TMP46:%.*]] = fcmp olt float [[TMP45]], 0.000000e+00
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <14 x float> [[TMP28]], i32 12
+; CHECK-NEXT:    [[TMP48:%.*]] = fcmp olt float [[TMP47]], 0.000000e+00
+; CHECK-NEXT:    [[TMP54:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00
+; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <14 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <13 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP62:%.*]] = shufflevector <13 x float> [[TMP27]], <13 x float> [[TMP61]], <4 x i32> <i32 1, i32 11, i32 12, i32 13>
+; CHECK-NEXT:    [[TMP63:%.*]] = fcmp ogt <4 x float> [[TMP62]], zeroinitializer
 ; CHECK-NEXT:    [[TMP52:%.*]] = fcmp oeq <16 x float> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    ret <16 x half> zeroinitializer
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
index b900bd3a8c331..67d43fe31b5f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll
@@ -14,12 +14,12 @@ define i32 @foo(i32 %0, ptr %1, ptr %2, i1 %arg)  {
 ; CHECK-NEXT:    br label [[T37:%.*]]
 ; CHECK:       t37:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv fast <2 x float> splat (float 1.000000e+00), [[TMP6]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[TMP7]]
 ; CHECK-NEXT:    [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[T4]], i64 0, i32 2, i64 0
 ; CHECK-NEXT:    store <4 x float> [[SHUFFLE]], ptr [[T21]], align 4
 ; CHECK-NEXT:    [[T89]] = load <2 x float>, ptr [[T9]], align 4
-; CHECK-NEXT:    br i1 %arg, label [[T37]], label [[T55:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[T37]], label [[T55:%.*]]
 ; CHECK:       t55:
 ; CHECK-NEXT:    ret i32 0
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index f7811aba5ab5f..57b46274f4a13 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -4,20 +4,13 @@
 define <4 x i16> @test() {
 ; CHECK-LABEL: define <4 x i16> @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
+; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
+; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
-; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
+; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
+; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer)
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
 ; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
 ; CHECK-NEXT:    ret <4 x i16> [[OP_RDX9]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
index 9c0f65ec27165..38f49791c570d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll
@@ -22,17 +22,17 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) {
 ; CHECK:       [[D_EXIT_6]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    br label %[[D_EXIT_7]]
 ; CHECK:       [[D_EXIT_7]]:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ [[TMP13]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> <i32 poison, i32 1, i32 1, i32 poison>, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3
-; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
index 648f051db4a52..c796320069e42 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -57,11 +57,11 @@ define internal i32 @ipvideo_decode_block_opcode_0xD_16() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF_END:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <8 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[IF_END:%.*]] ]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    store <8 x i16> [[TMP1]], ptr undef, align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
index b4996eb58b47e..798c94e84c99d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -20,7 +20,6 @@ define void @test() {
 ; CHECK-NEXT:    [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]])
 ; CHECK-NEXT:    [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> <i32 0, i32 4>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> <i32 1, i32 5>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll b/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll
index 5e52bd7cc954c..27217459fa7b8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll
@@ -12,9 +12,9 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[VAL_SROA_6_1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uge <2 x float> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <12 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <12 x i1> <i1 true, i1 undef, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 undef>, <12 x i1> [[TMP3]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 12, i32 13, i32 11>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <12 x i1> [[TMP4]], i1 [[CMP119]], i32 11
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <12 x i1> [[TMP5]], <12 x i1> poison, <12 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <12 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 undef, i1 undef>, <12 x i1> [[TMP3]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <12 x i1> [[TMP4]], i1 [[CMP119]], i32 10
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <12 x i1> [[TMP5]], <12 x i1> poison, <12 x i32> <i32 0, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
 ; CHECK-NEXT:    [[TMP7]] = select <12 x i1> [[TMP6]], <12 x float> zeroinitializer, <12 x float> zeroinitializer
 ; CHECK-NEXT:    br label %[[DO_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 32e59697486a7..ee98c6ec29a54 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,28 +7,31 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
-; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
-; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
-; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
-; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
-; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
-; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
-; X86-NEXT:    ret i1 [[TMP25]]
+; X86-NEXT:    [[TMP5:%.*]] = fmul double 0.000000e+00, 0.000000e+00
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, double [[TMP1]], i32 1
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[TMP5]], i32 2
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = insertelement <5 x double> poison, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <5 x double> [[TMP11]], <5 x double> [[TMP12]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; X86-NEXT:    [[TMP14:%.*]] = fmul <5 x double> zeroinitializer, [[TMP13]]
+; X86-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison>
+; X86-NEXT:    [[TMP16:%.*]] = shufflevector <5 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00>, <5 x double> [[TMP15]], <5 x i32> <i32 5, i32 6, i32 7, i32 8, i32 4>
+; X86-NEXT:    [[TMP17:%.*]] = fsub <5 x double> [[TMP16]], [[TMP14]]
+; X86-NEXT:    [[TMP18:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <3 x i32> <i32 2, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP19:%.*]] = shufflevector <3 x double> [[TMP18]], <3 x double> <double poison, double 0.000000e+00, double 0.000000e+00>, <3 x i32> <i32 0, i32 4, i32 5>
+; X86-NEXT:    [[TMP20:%.*]] = fmul <3 x double> zeroinitializer, [[TMP19]]
+; X86-NEXT:    [[TMP21:%.*]] = shufflevector <5 x double> [[TMP17]], <5 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP22:%.*]] = shufflevector <3 x double> [[TMP20]], <3 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP23:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+; X86-NEXT:    [[TMP24:%.*]] = fptrunc <8 x double> [[TMP23]] to <8 x float>
+; X86-NEXT:    [[TMP25:%.*]] = fmul <8 x float> [[TMP24]], zeroinitializer
+; X86-NEXT:    [[TMP26:%.*]] = fcmp oeq <8 x float> [[TMP25]], zeroinitializer
+; X86-NEXT:    [[TMP27:%.*]] = freeze <8 x i1> [[TMP26]]
+; X86-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP27]])
+; X86-NEXT:    ret i1 [[TMP28]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
index fff988a0a746e..c814b71e3dd1b 100644
--- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
@@ -13,7 +13,7 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 9, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i48> <i48 poison, i48 0, i48 0, i48 0>, i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i48> [[TMP9]] to <4 x i8>
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
index dbf24bc7721c9..a7fa1607d8f32 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll
@@ -14,7 +14,7 @@ define void @test(i8 %0, i8 %1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
index 304af88b6d134..f05c846027910 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll
@@ -7,13 +7,8 @@
 define i64 @test(ptr %p) {
 ; RISCV-LABEL: @test(
 ; RISCV-NEXT:  entry:
-; RISCV-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4
-; RISCV-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4
-; RISCV-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4
-; RISCV-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 0, i32 0>
-; RISCV-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[TMP2]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 6, i32 7>
-; RISCV-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; RISCV-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; RISCV-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4
+; RISCV-NEXT:    [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 0, i32 0>
 ; RISCV-NEXT:    [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], splat (i64 42)
 ; RISCV-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
 ; RISCV-NEXT:    ret i64 [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
index 940ee5b95871d..d6a35d21332ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
@@ -39,10 +39,10 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227)
 ; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
 ; AARCH64-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
-; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
-; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> <i32 2, i32 0, i32 poison, i32 poison, i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP7]], <8 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 2, i32 3>
 ; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AARCH64-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
 ; AARCH64-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index b738d25b39be1..7535f277fba4b 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -7,10 +7,9 @@ define void @test1(ptr %in, ptr %out) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT]], align 8
 ; CHECK-NEXT:    ret void
@@ -19,10 +18,9 @@ define void @test1(ptr %in, ptr %out) {
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
 ; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
-; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
@@ -53,10 +51,9 @@ define void @test2(ptr %in, ptr %out) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT]], align 8
 ; CHECK-NEXT:    ret void
@@ -65,10 +62,9 @@ define void @test2(ptr %in, ptr %out) {
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
 ; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
-; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
@@ -97,7 +93,7 @@ entry:
 define void @test3(<16 x i32> %0, ptr %out) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
 ; CHECK-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
@@ -105,7 +101,7 @@ define void @test3(<16 x i32> %0, ptr %out) {
 ;
 ; COMBINE-LABEL: @test3(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
 ; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
 ; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
@@ -132,7 +128,7 @@ define void @test4(ptr %in, ptr %out) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4
 ; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
@@ -142,7 +138,7 @@ define void @test4(ptr %in, ptr %out) {
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4
 ; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
-; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
@@ -189,36 +185,29 @@ entry:
 define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32
+; CHECK-NEXT:    [[TMP0:%.*]] = load <12 x float>, ptr [[GEP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <12 x float> [[TMP0]], <12 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <32 x i8> [[TMP8]] to <32 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <32 x i16> [[TMP10]] to <32 x float>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x float> [[TMP16]], <16 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <32 x float> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store <32 x float> [[TMP4]], ptr [[IN2:%.*]], align 16
 ; CHECK-NEXT:    [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32
-; CHECK-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128
-; CHECK-NEXT:    [[TMP17:%.*]] = load <8 x float>, ptr [[IN0]], align 16
-; CHECK-NEXT:    store <32 x float> [[TMP4]], ptr [[IN2]], align 16
 ; CHECK-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2]], i64 128
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP6:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP24]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
@@ -228,36 +217,29 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ;
 ; COMBINE-LABEL: @test6(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32
+; COMBINE-NEXT:    [[TMP0:%.*]] = load <12 x float>, ptr [[GEP1:%.*]], align 16
+; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <12 x float> [[TMP0]], <12 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16
-; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1
-; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; COMBINE-NEXT:    [[TMP4:%.*]] = zext <32 x i8> [[TMP11]] to <32 x i16>
-; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; COMBINE-NEXT:    [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; COMBINE-NEXT:    [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float>
-; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
+; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2:%.*]], align 16
 ; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32
-; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128
-; COMBINE-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
-; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
-; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP20:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; COMBINE-NEXT:    [[TMP23:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2]], i64 128
+; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; COMBINE-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
-; COMBINE-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; COMBINE-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float>
-; COMBINE-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; COMBINE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP22:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; COMBINE-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP22:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; COMBINE-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; COMBINE-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; COMBINE-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll
index ac8b10a0087d0..418d1170cfff0 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec.ll
@@ -224,8 +224,8 @@ define void @test8() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x float> [ [[TMP7]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP8]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8]] = phi <8 x float> [ [[TMP2:%.*]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br i1 false, label [[FOR0]], label [[FOR_BODY]]
 ;
 entry:
@@ -268,10 +268,9 @@ define void @test10() {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr null, align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[TMP8]] to <16 x i32>
@@ -380,7 +379,7 @@ entry:
 define void @test13(<8 x i32> %0, ptr %out0, ptr %out1, ptr %out2) {
 ; CHECK-LABEL: @test13(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0:%.*]], <8 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0:%.*]], <8 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       for.end.loopexit:
@@ -416,10 +415,9 @@ for.end.loopexit:
 define void @test14(<8 x i1> %0) {
 ; CHECK-LABEL: @test14(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0:%.*]], <8 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0:%.*]], <8 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    br label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <16 x i16> [ [[TMP5]], [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
index cf5927bf58327..4b110143ec1e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
@@ -1,22 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s %}
-; RUN: %if aaarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s --check-prefix=X86  %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define i32 @test() {
-; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label [[BB1:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    br i1 false, label [[BB4:%.*]], label [[BB3]]
-; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
-; CHECK-NEXT:    [[TMP5]] = add <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    br label [[BB1]]
-; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ]
-; CHECK-NEXT:    ret i32 0
+; X86-LABEL: define i32 @test() {
+; X86-NEXT:  bb:
+; X86-NEXT:    br label [[BB1:%.*]]
+; X86:       bb1:
+; X86-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ [[TMP4:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ]
+; X86-NEXT:    br i1 false, label [[BB4:%.*]], label [[BB3]]
+; X86:       bb3:
+; X86-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; X86-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
+; X86-NEXT:    [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]]
+; X86-NEXT:    [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; X86-NEXT:    br label [[BB1]]
+; X86:       bb4:
+; X86-NEXT:    [[TMP5:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ]
+; X86-NEXT:    ret i32 0
+;
+; AARCH64-LABEL: define i32 @test() {
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    br label [[BB1:%.*]]
+; AARCH64:       bb1:
+; AARCH64-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ [[TMP2:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ]
+; AARCH64-NEXT:    br i1 false, label [[BB4:%.*]], label [[BB3]]
+; AARCH64:       bb3:
+; AARCH64-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x i32> <i32 0, i32 0, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP2]] = add <8 x i32> zeroinitializer, [[TMP1]]
+; AARCH64-NEXT:    br label [[BB1]]
+; AARCH64:       bb4:
+; AARCH64-NEXT:    [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ]
+; AARCH64-NEXT:    ret i32 0
 ;
 bb:
   br label %bb1



More information about the llvm-commits mailing list