[llvm] [SLP]Improve vectorization of gathered loads. (PR #89129)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed May 1 07:00:08 PDT 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/89129

>From 00d7fe551ea4ba71f572d2f8c211977df101a16a Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 17 Apr 2024 19:46:10 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 728 ++++++++++++++++--
 .../SLPVectorizer/AArch64/loadorder.ll        | 231 +++---
 .../AArch64/multiple_reduction.ll             | 298 ++++---
 .../SLPVectorizer/AArch64/tsc-s116.ll         |  22 +-
 .../SLPVectorizer/AArch64/vec3-calls.ll       |  27 +-
 .../vectorizable-selects-uniform-cmps.ll      |  43 +-
 .../SLPVectorizer/RISCV/complex-loads.ll      | 384 +++++----
 .../SLPVectorizer/RISCV/reductions.ll         |  44 +-
 .../SLPVectorizer/SystemZ/pr34619.ll          |  13 +-
 .../Transforms/SLPVectorizer/X86/PR32086.ll   |  28 +-
 .../Transforms/SLPVectorizer/X86/PR35777.ll   |  25 +-
 .../Transforms/SLPVectorizer/X86/addsub.ll    |  42 +-
 .../X86/bottom-to-top-reorder.ll              |  30 +-
 .../X86/buildvector-same-lane-insert.ll       |  22 +-
 .../SLPVectorizer/X86/horizontal-minmax.ll    |  50 +-
 .../X86/load-merge-inseltpoison.ll            |  53 +-
 .../SLPVectorizer/X86/load-merge.ll           |  53 +-
 .../X86/load-partial-vector-shuffle.ll        |  39 +-
 .../Transforms/SLPVectorizer/X86/lookahead.ll | 125 +--
 .../X86/minbitwidth-user-not-min.ll           |   2 +-
 .../SLPVectorizer/X86/operandorder.ll         |  36 +-
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll |  43 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 378 +++++----
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 378 +++++----
 .../SLPVectorizer/X86/pr48879-sroa.ll         | 162 ++--
 .../X86/reduced-gathered-vectorized.ll        |  27 +-
 .../SLPVectorizer/X86/reduction_loads.ll      |  30 +-
 ...masked-loads-consecutive-loads-same-ptr.ll |   2 +-
 .../X86/remark-partial-loads-vectorize.ll     |  24 +-
 .../X86/remark_gather-load-redux-cost.ll      |   2 +-
 .../SLPVectorizer/X86/remark_horcost.ll       |  11 +-
 .../SLPVectorizer/X86/remark_not_all_parts.ll |   2 +-
 .../X86/reorder-clustered-node.ll             |  25 +-
 .../SLPVectorizer/X86/reorder-phi-operand.ll  |  17 +-
 .../X86/reorder-possible-strided-node.ll      |  32 +-
 .../X86/reorder-reused-masked-gather2.ll      |   6 +-
 .../SLPVectorizer/X86/reorder_phi.ll          |  38 +-
 .../Transforms/SLPVectorizer/X86/return.ll    |  14 +-
 .../X86/scatter-vectorize-reorder.ll          |  17 +-
 .../X86/scatter-vectorize-reused-pointer.ll   |  18 +-
 .../SLPVectorizer/X86/scheduling.ll           |   9 +-
 .../SLPVectorizer/X86/shrink_after_reorder.ll |  15 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |  99 +--
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  49 +-
 .../X86/split-load8_2_unord_geps.ll           |  24 +-
 .../X86/store-insertelement-minbitwidth.ll    |   4 +-
 .../Transforms/SLPVectorizer/X86/supernode.ll |  81 +-
 .../SLPVectorizer/X86/vec3-calls.ll           |  14 +-
 .../X86/vec_list_bias-inseltpoison.ll         |  39 +-
 .../SLPVectorizer/X86/vec_list_bias.ll        |  39 +-
 .../vec_list_bias_external_insert_shuffled.ll |  31 +-
 .../X86/vect-gather-same-nodes.ll             |  25 +-
 .../X86/vectorize-reordered-list.ll           |  14 +-
 53 files changed, 2123 insertions(+), 1841 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 806e8085038b35..6730d0c4db7fea 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1133,6 +1133,8 @@ class BoUpSLP {
     MultiNodeScalars.clear();
     MustGather.clear();
     EntryToLastInstruction.clear();
+    GatheredLoads.clear();
+    GatheredLoadsEntriesFirst = NoGatheredLoads;
     ExternalUses.clear();
     ExternalUsesAsGEPs.clear();
     for (auto &Iter : BlocksSchedules) {
@@ -1170,8 +1172,9 @@ class BoUpSLP {
   /// identity order is important, or the actual order.
   /// \param TopToBottom If true, include the order of vectorized stores and
   /// insertelement nodes, otherwise skip them.
-  std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
-                                              bool TopToBottom);
+  std::optional<OrdersType> getReorderingData(
+      const TreeEntry &TE, bool TopToBottom,
+      DenseMap<const TreeEntry *, TreeEntry *> &ScatterVectorizeToReorder);
 
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
@@ -2558,6 +2561,11 @@ class BoUpSLP {
   /// be beneficial even the tree height is tiny.
   bool isFullyVectorizableTinyTree(bool ForReduction) const;
 
+  /// Run through the list of all gathered loads in the graph and try to find
+  /// vector loads/masked gathers instead of regular gathers. Later these loads
+  /// are reshufled to build final gathered nodes.
+  void tryToVectorizeGatheredLoads();
+
   /// Reorder commutative or alt operands to get better probability of
   /// generating vectorized code.
   static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
@@ -3010,6 +3018,14 @@ class BoUpSLP {
         CastMaxMinBWSizes =
             std::make_pair(std::numeric_limits<unsigned>::max(), 1);
       MustGather.insert(VL.begin(), VL.end());
+      if (GatheredLoadsEntriesFirst == NoGatheredLoads ||
+          Last->Idx < GatheredLoadsEntriesFirst || UserTreeIdx.UserTE ||
+          S.getOpcode() != Instruction::Load) {
+        // Build a map for gathered scalars to the nodes where they are used.
+        for (Value *V : VL)
+          if (!isConstant(V))
+            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+      }
     }
 
     if (UserTreeIdx.UserTE) {
@@ -3085,6 +3101,14 @@ class BoUpSLP {
       DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
   ValueToGatherNodesMap ValueToGatherNodes;
 
+  /// A list of loads to be gathered during the vectorization process. We can
+  /// try to vectorize them at the end, if profitable.
+  SmallVector<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads;
+
+  /// The index of the first gathered load entry in the VectorizeTree.
+  constexpr static int NoGatheredLoads = -1;
+  int GatheredLoadsEntriesFirst = NoGatheredLoads;
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -4604,12 +4628,17 @@ static bool areTwoInsertFromSameBuildVector(
   return false;
 }
 
-std::optional<BoUpSLP::OrdersType>
-BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
+std::optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(
+    const TreeEntry &TE, bool TopToBottom,
+    DenseMap<const TreeEntry *, TreeEntry *> &ScatterVectorizeToReorder) {
   // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
   if (TE.isNonPowOf2Vec())
     return std::nullopt;
 
+  if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+      TE.Idx >= GatheredLoadsEntriesFirst && TE.UserTreeIndices.empty() &&
+      &TE != VectorizableTree.front().get())
+    return std::nullopt;
   // No need to reorder if need to shuffle reuses, still need to shuffle the
   // node.
   if (!TE.ReuseShuffleIndices.empty()) {
@@ -4781,6 +4810,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       return std::nullopt; // No need to reorder.
     return std::move(ResOrder);
   }
+  bool LoadsScatterVectorize = false;
   if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
       allSameType(TE.Scalars)) {
     // TODO: add analysis of other gather nodes with extractelement
@@ -4843,8 +4873,32 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     if (TE.Scalars.size() >= 4)
       if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
         return Order;
-    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
+    if (TE.getOpcode() == Instruction::Load) {
+      SmallVector<Value *> PointerOps;
+      OrdersType CurrentOrder;
+      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
+                                         CurrentOrder, PointerOps);
+      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+        return std::move(CurrentOrder);
+      LoadsScatterVectorize = Res == LoadsState::ScatterVectorize;
+    }
+    if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) {
+      if (LoadsScatterVectorize) {
+        if (TreeEntry *ScatterVectorTE = getTreeEntry(TE.Scalars.front());
+            ScatterVectorTE &&
+            ScatterVectorTE->Idx >= GatheredLoadsEntriesFirst &&
+            ScatterVectorTE->UserTreeIndices.empty() &&
+            ScatterVectorTE->State == TreeEntry::ScatterVectorize &&
+            ScatterVectorTE->Scalars.size() == TE.Scalars.size() &&
+            all_of(TE.Scalars, [&](Value *V) {
+              return getTreeEntry(V) == ScatterVectorTE;
+            })) {
+          ScatterVectorizeToReorder.try_emplace(&TE, ScatterVectorTE);
+          return std::nullopt;
+        }
+      }
       return CurrentOrder;
+    }
   }
   return std::nullopt;
 }
@@ -4930,73 +4984,83 @@ void BoUpSLP::reorderTopToBottom() {
   // Maps a TreeEntry to the reorder indices of external users.
   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
       ExternalUserReorderMap;
+  // Nodes with loads masked gathering built out of gathered loads that should
+  // be reordered to avoid extra shuffles.
+  DenseMap<const TreeEntry *, TreeEntry *> ScatterVectorizeToReorder;
   // Find all reorderable nodes with the given VF.
   // Currently the are vectorized stores,loads,extracts + some gathering of
   // extracts.
-  for_each(VectorizableTree, [&, &TTIRef = *TTI](
-                                 const std::unique_ptr<TreeEntry> &TE) {
-    // Look for external users that will probably be vectorized.
-    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
-        findExternalStoreUsersReorderIndices(TE.get());
-    if (!ExternalUserReorderIndices.empty()) {
-      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
-      ExternalUserReorderMap.try_emplace(TE.get(),
-                                         std::move(ExternalUserReorderIndices));
-    }
-
-    // Patterns like [fadd,fsub] can be combined into a single instruction in
-    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
-    // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffle()) {
-      VectorType *VecTy =
-          FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
-      unsigned Opcode0 = TE->getOpcode();
-      unsigned Opcode1 = TE->getAltOpcode();
-      // The opcode mask selects between the two opcodes.
-      SmallBitVector OpcodeMask(TE->Scalars.size(), false);
-      for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
-        if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
-          OpcodeMask.set(Lane);
-      // If this pattern is supported by the target then we consider the order.
-      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
-        VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
-        AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
-      }
-      // TODO: Check the reverse order too.
-    }
+  for_each(
+      ArrayRef(VectorizableTree)
+          .drop_back(GatheredLoadsEntriesFirst == NoGatheredLoads
+                         ? 0
+                         : VectorizableTree.size() - GatheredLoadsEntriesFirst),
+      [&, &TTIRef = *TTI](const std::unique_ptr<TreeEntry> &TE) {
+        // Look for external users that will probably be vectorized.
+        SmallVector<OrdersType, 1> ExternalUserReorderIndices =
+            findExternalStoreUsersReorderIndices(TE.get());
+        if (!ExternalUserReorderIndices.empty()) {
+          VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+          ExternalUserReorderMap.try_emplace(
+              TE.get(), std::move(ExternalUserReorderIndices));
+        }
 
-    if (std::optional<OrdersType> CurrentOrder =
-            getReorderingData(*TE, /*TopToBottom=*/true)) {
-      // Do not include ordering for nodes used in the alt opcode vectorization,
-      // better to reorder them during bottom-to-top stage. If follow the order
-      // here, it causes reordering of the whole graph though actually it is
-      // profitable just to reorder the subgraph that starts from the alternate
-      // opcode vectorization node. Such nodes already end-up with the shuffle
-      // instruction and it is just enough to change this shuffle rather than
-      // rotate the scalars for the whole graph.
-      unsigned Cnt = 0;
-      const TreeEntry *UserTE = TE.get();
-      while (UserTE && Cnt < RecursionMaxDepth) {
-        if (UserTE->UserTreeIndices.size() != 1)
-          break;
-        if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
-              return EI.UserTE->State == TreeEntry::Vectorize &&
-                     EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
-            }))
-          return;
-        UserTE = UserTE->UserTreeIndices.back().UserTE;
-        ++Cnt;
-      }
-      VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
-      if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) ||
-          !TE->ReuseShuffleIndices.empty())
-        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
-      if (TE->State == TreeEntry::Vectorize &&
-          TE->getOpcode() == Instruction::PHI)
-        PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
-    }
-  });
+        // Patterns like [fadd,fsub] can be combined into a single instruction
+        // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
+        // need to take into account their order when looking for the most used
+        // order.
+        if (TE->isAltShuffle()) {
+          VectorType *VecTy = FixedVectorType::get(TE->Scalars[0]->getType(),
+                                                   TE->Scalars.size());
+          unsigned Opcode0 = TE->getOpcode();
+          unsigned Opcode1 = TE->getAltOpcode();
+          // The opcode mask selects between the two opcodes.
+          SmallBitVector OpcodeMask(TE->Scalars.size(), false);
+          for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
+            if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
+              OpcodeMask.set(Lane);
+          // If this pattern is supported by the target then we consider the
+          // order.
+          if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+            VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+            AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
+          }
+          // TODO: Check the reverse order too.
+        }
+
+        if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+                *TE, /*TopToBottom=*/true, ScatterVectorizeToReorder)) {
+          // Do not include ordering for nodes used in the alt opcode
+          // vectorization, better to reorder them during bottom-to-top stage.
+          // If follow the order here, it causes reordering of the whole graph
+          // though actually it is profitable just to reorder the subgraph that
+          // starts from the alternate opcode vectorization node. Such nodes
+          // already end-up with the shuffle instruction and it is just enough
+          // to change this shuffle rather than rotate the scalars for the whole
+          // graph.
+          unsigned Cnt = 0;
+          const TreeEntry *UserTE = TE.get();
+          while (UserTE && Cnt < RecursionMaxDepth) {
+            if (UserTE->UserTreeIndices.size() != 1)
+              break;
+            if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
+                  return EI.UserTE->State == TreeEntry::Vectorize &&
+                         EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
+                }))
+              return;
+            UserTE = UserTE->UserTreeIndices.back().UserTE;
+            ++Cnt;
+          }
+          VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
+          if (!(TE->State == TreeEntry::Vectorize ||
+                TE->State == TreeEntry::StridedVectorize) ||
+              !TE->ReuseShuffleIndices.empty())
+            GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
+          if (TE->State == TreeEntry::Vectorize &&
+              TE->getOpcode() == Instruction::PHI)
+            PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
+        }
+      });
 
   // Reorder the graph nodes according to their vectorization factor.
   for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
@@ -5126,6 +5190,10 @@ void BoUpSLP::reorderTopToBottom() {
     });
     // Do an actual reordering, if profitable.
     for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+      // Do not reorder gathered loads.
+      if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+          TE->Idx >= GatheredLoadsEntriesFirst)
+        continue;
       // Just do the reordering for the nodes with the given VF.
       if (TE->Scalars.size() != VF) {
         if (TE->ReuseShuffleIndices.size() == VF) {
@@ -5242,12 +5310,20 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   // Currently the are vectorized loads,extracts without alternate operands +
   // some gathering of extracts.
   SmallVector<TreeEntry *> NonVectorized;
-  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+  // Nodes with loads masked gathering built out of gathered loads that should
+  // be reordered to avoid extra shuffles.
+  DenseMap<const TreeEntry *, TreeEntry *> ScatterVectorizeToReorder;
+  for (const std::unique_ptr<TreeEntry> &TE :
+       ArrayRef(VectorizableTree)
+           .drop_back(GatheredLoadsEntriesFirst == NoGatheredLoads
+                          ? 0
+                          : VectorizableTree.size() -
+                                GatheredLoadsEntriesFirst)) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize)
       NonVectorized.push_back(TE.get());
-    if (std::optional<OrdersType> CurrentOrder =
-            getReorderingData(*TE, /*TopToBottom=*/false)) {
+    if (std::optional<OrdersType> CurrentOrder = getReorderingData(
+            *TE, /*TopToBottom=*/false, ScatterVectorizeToReorder)) {
       OrderedEntries.insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize) ||
@@ -5284,6 +5360,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       // search. The graph currently does not provide this dependency directly.
       for (EdgeInfo &EI : TE->UserTreeIndices) {
         TreeEntry *UserTE = EI.UserTE;
+        if (!UserTE)
+          continue;
         auto It = Users.find(UserTE);
         if (It == Users.end())
           It = Users.insert({UserTE, {}}).first;
@@ -5300,6 +5378,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       return Data1.first->Idx > Data2.first->Idx;
     });
     for (auto &Data : UsersVec) {
+      if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+          Data.first->Idx >= GatheredLoadsEntriesFirst)
+        llvm_unreachable("Gathered loads nodes must not be reordered.");
       // Check that operands are used only in the User node.
       SmallVector<TreeEntry *> GatherOps;
       if (!canReorderOperands(Data.first, Data.second, NonVectorized,
@@ -5327,7 +5408,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         const auto Order = [&]() -> const OrdersType {
           if (OpTE->State == TreeEntry::NeedToGather ||
               !OpTE->ReuseShuffleIndices.empty())
-            return getReorderingData(*OpTE, /*TopToBottom=*/false)
+            return getReorderingData(*OpTE, /*TopToBottom=*/false,
+                                     ScatterVectorizeToReorder)
                 .value_or(OrdersType(1));
           return OpTE->ReorderIndices;
         }();
@@ -5366,7 +5448,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             return true;
           if (TE->State == TreeEntry::NeedToGather) {
             if (GathersToOrders.contains(TE))
-              return !getReorderingData(*TE, /*TopToBottom=*/false)
+              return !getReorderingData(*TE, /*TopToBottom=*/false,
+                                        ScatterVectorizeToReorder)
                           .value_or(OrdersType(1))
                           .empty();
             return true;
@@ -5515,6 +5598,71 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
       VectorizableTree.front()->ReuseShuffleIndices.empty())
     VectorizableTree.front()->ReorderIndices.clear();
+  // Reorder masked gather nodes built out of gathered loads.
+  SmallPtrSet<const TreeEntry *, 4> Processed;
+  for (const auto &SVLoadsData : ScatterVectorizeToReorder) {
+    if (!Processed.insert(SVLoadsData.second).second)
+      continue;
+    std::optional<OrdersType> CurrentOrder =
+        findReusedOrderedScalars(*SVLoadsData.first);
+    assert(CurrentOrder && "Expected order.");
+    if (CurrentOrder->empty() || !SVLoadsData.second->UserTreeIndices.empty())
+      continue;
+    SmallVector<TreeEntry *> Operands;
+    SmallVector<const TreeEntry *> Worklist(1, SVLoadsData.second);
+    while (!Worklist.empty()) {
+      const TreeEntry *CurrentEntry = Worklist.pop_back_val();
+      for (unsigned I = 0, E = CurrentEntry->getNumOperands(); I < E; ++I) {
+        if (CurrentEntry->getOpcode() == Instruction::ExtractElement)
+          continue;
+        if (CurrentEntry->getOpcode() == Instruction::Call) {
+          auto *CI = cast<CallInst>(CurrentEntry->getMainOp());
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+          if (I >= CI->arg_size() ||
+              isVectorIntrinsicWithScalarOpAtArg(ID, I))
+            continue;
+        }
+        const TreeEntry *Op = getOperandEntry(CurrentEntry, I);
+        if (Op->ReuseShuffleIndices.empty())
+          Worklist.push_back(Op);
+        Operands.push_back(const_cast<TreeEntry *>(Op));
+      }
+    }
+    // If there are several users of the pointers tree entry, no need to
+    // reorder the scatter vectorize node, still have same number of shuffles.
+    if (any_of(Operands, [](const TreeEntry *TE) {
+          return TE->UserTreeIndices.size() > 1;
+        }))
+      continue;
+    // Reorder related masked gather node and its operands.
+    SmallVector<int> Mask(CurrentOrder->size(), PoisonMaskElem);
+    unsigned E = CurrentOrder->size();
+    transform(*CurrentOrder, Mask.begin(), [E](unsigned I) {
+      return I < E ? static_cast<int>(I) : PoisonMaskElem;
+    });
+    for (TreeEntry *OpTE : Operands) {
+      if (!OpTE->ReuseShuffleIndices.empty()) {
+        reorderReuses(OpTE->ReuseShuffleIndices, Mask);
+      } else if (OpTE->State == TreeEntry::NeedToGather) {
+        if (OpTE->ReorderIndices.empty())
+          reorderScalars(OpTE->Scalars, Mask);
+        else
+          reorderOrder(OpTE->ReorderIndices, Mask);
+      } else {
+        OpTE->reorderOperands(Mask);
+        if (OpTE->ReorderIndices.empty())
+          reorderScalars(OpTE->Scalars, Mask);
+        else
+          reorderOrder(OpTE->ReorderIndices, Mask);
+      }
+    }
+    SVLoadsData.second->reorderOperands(Mask);
+    if (SVLoadsData.second->ReorderIndices.empty())
+      reorderScalars(SVLoadsData.second->Scalars, Mask);
+    else
+      reorderOrder(SVLoadsData.second->ReorderIndices, Mask,
+                   /*BottomOrder=*/true);
+  }
 }
 
 void BoUpSLP::buildExternalUses(
@@ -5743,6 +5891,15 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
+  // Try to vectorize gathered loads if this is not just a gather of loads.
+  if (!GatheredLoads.empty() &&
+      !(VectorizableTree.size() == 2 &&
+        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+        VectorizableTree.front()->State == TreeEntry::Vectorize &&
+        VectorizableTree.front()->State == TreeEntry::StridedVectorize &&
+        VectorizableTree.back()->State == TreeEntry::NeedToGather &&
+        VectorizableTree.back()->getOpcode() != Instruction::Load))
+    tryToVectorizeGatheredLoads();
 }
 
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
@@ -5750,6 +5907,321 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
   if (!allSameType(Roots))
     return;
   buildTree_rec(Roots, 0, EdgeInfo());
+  // Try to vectorize gathered loads if this is not just a gather of loads.
+  if (!GatheredLoads.empty() &&
+      !(VectorizableTree.size() == 2 &&
+        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+        VectorizableTree.front()->State == TreeEntry::Vectorize &&
+        VectorizableTree.front()->State == TreeEntry::StridedVectorize &&
+        VectorizableTree.back()->State == TreeEntry::NeedToGather &&
+        VectorizableTree.back()->getOpcode() != Instruction::Load))
+    tryToVectorizeGatheredLoads();
+}
+
+/// Tries to find subvector of loads and builds new vector of only loads if can
+/// be profitable.
+static bool gatherPossiblyVectorizableLoads(
+    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
+    ScalarEvolution &SE, const TargetTransformInfo &TTI,
+    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
+    bool AddNew = true) {
+  if (VL.empty())
+    return true;
+  if (!isValidElementType(VL.front()->getType()))
+    return false;
+  Type *ScalarTy = VL.front()->getType();
+  int NumScalars = VL.size();
+  auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
+  int NumParts = TTI.getNumberOfParts(VecTy);
+  if (NumParts == 0 || NumParts >= NumScalars)
+    NumParts = 1;
+  unsigned VF = PowerOf2Ceil(NumScalars / NumParts);
+  SmallVector<SmallVector<std::pair<LoadInst *, int>>> ClusteredLoads;
+  for (int I : seq<int>(0, NumParts)) {
+    for (Value *V :
+         VL.slice(I * VF, std::min<unsigned>(VF, VL.size() - I * VF))) {
+      auto *LI = dyn_cast<LoadInst>(V);
+      if (!LI)
+        continue;
+      if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
+        continue;
+      bool IsFound = false;
+      for (auto &Data : ClusteredLoads) {
+        if (LI->getParent() != Data.front().first->getParent())
+          continue;
+        std::optional<int> Dist =
+            getPointersDiff(LI->getType(), LI->getPointerOperand(),
+                            Data.front().first->getType(),
+                            Data.front().first->getPointerOperand(), DL, SE,
+                            /*StrictCheck=*/true);
+        if (Dist && all_of(Data, [&](const std::pair<LoadInst *, int> &Pair) {
+              IsFound |= Pair.first == LI;
+              return IsFound || Pair.second != *Dist;
+            })) {
+          if (!IsFound)
+            Data.emplace_back(LI, *Dist);
+          IsFound = true;
+          break;
+        }
+      }
+      if (!IsFound)
+        ClusteredLoads.emplace_back().emplace_back(LI, 0);
+    }
+    // If more than 2 vector loads required - rely on smart gather of loads,
+    // reshuffling of gathers does not know how to handle it.
+    if (count_if(ClusteredLoads,
+                 [](ArrayRef<std::pair<LoadInst *, int>> Loads) {
+                   return Loads.size() >= 2;
+                 }) > 2)
+      return false;
+  }
+  auto FindMatchingLoads =
+      [&](ArrayRef<std::pair<LoadInst *, int>> Loads,
+          SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>>
+              &GatheredLoads,
+          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
+          int &Offset) {
+        if (Loads.empty())
+          return GatheredLoads.end();
+        LoadInst *LI = Loads.front().first;
+        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
+          ToAdd.clear();
+          if (LI->getParent() != Data.front().first->getParent())
+            continue;
+          std::optional<int> Dist =
+              getPointersDiff(LI->getType(), LI->getPointerOperand(),
+                              Data.front().first->getType(),
+                              Data.front().first->getPointerOperand(), DL, SE,
+                              /*StrictCheck=*/true);
+          if (Dist) {
+            // Found matching gathered loads - check if all loads are unique or
+            // can be effectively vectorized.
+            unsigned NumUniques = 0;
+            for (auto [Idx, Pair] : enumerate(Loads)) {
+              bool Used = false;
+              if (none_of(Data,
+                          [&, &P = Pair](const std::pair<LoadInst *, int> &PD) {
+                            Used |= PD.first == P.first;
+                            return *Dist + P.second == PD.second;
+                          }) &&
+                  !Used) {
+                ++NumUniques;
+                ToAdd.insert(Idx);
+              }
+              if (Used)
+                Repeated.insert(Idx);
+            }
+            if (Loads.size() == NumUniques ||
+                (Loads.size() - NumUniques >= 2 &&
+                 Loads.size() - NumUniques >= Loads.size() / 2 &&
+                 PowerOf2Ceil(Data.size()) <
+                     PowerOf2Ceil(Data.size() + NumUniques))) {
+              Offset = *Dist;
+              return std::next(GatheredLoads.begin(), Idx);
+            }
+          }
+        }
+        ToAdd.clear();
+        return GatheredLoads.end();
+      };
+  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
+    SetVector<unsigned> ToAdd, Repeated;
+    int Offset = 0;
+    auto *It = FindMatchingLoads(Data, GatheredLoads, ToAdd, Repeated, Offset);
+    if (It != GatheredLoads.end()) {
+      assert(!ToAdd.empty() && "Expected some elements to add.");
+      for (unsigned Idx : ToAdd)
+        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
+    }
+    if (ToAdd.size() + Repeated.size() != Data.size()) {
+      LoadInst *LI = Data.front().first;
+      auto *It = AddNew ? GatheredLoads.end()
+                        : find_if(GatheredLoads,
+                                  [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
+                                    return PD.front().first->getParent() ==
+                                               LI->getParent() &&
+                                           PD.front().first->getType() ==
+                                               LI->getType();
+                                  });
+      if (It == GatheredLoads.end()) {
+        GatheredLoads.emplace_back();
+        It = std::next(GatheredLoads.begin(), GatheredLoads.size() - 1);
+      }
+      for (unsigned Idx : seq<unsigned>(0, Data.size())) {
+        if (ToAdd.contains(Idx) || Repeated.contains(Idx))
+          continue;
+        It->push_back(Data[Idx]);
+      }
+    }
+  }
+  return true;
+}
+
+void BoUpSLP::tryToVectorizeGatheredLoads() {
+  GatheredLoadsEntriesFirst = VectorizableTree.size();
+
+  // Sort loads by distance.
+  auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
+                       const std::pair<LoadInst *, int> &L2) {
+    return L1.second > L2.second;
+  };
+
+  auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
+                                    BoUpSLP::ValueSet &VectorizedLoads,
+                                    SmallVectorImpl<LoadInst *> &NonVectorized) {
+    SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
+    unsigned StartIdx = 0;
+    SmallVector<int> CandidateVFs;
+    if (VectorizeNonPowerOf2 && isPowerOf2_32(Loads.size() + 1))
+      CandidateVFs.push_back(Loads.size());
+    for (int NumElts = bit_floor(Loads.size()); NumElts > 1; NumElts /= 2) {
+      CandidateVFs.push_back(NumElts);
+      if (VectorizeNonPowerOf2 && NumElts > 2)
+        CandidateVFs.push_back(NumElts - 1);
+    }
+
+    for (int NumElts : CandidateVFs) {
+      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E;
+           ++Cnt) {
+        ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(Cnt, NumElts);
+        if (VectorizedLoads.count(Slice.front()) ||
+            VectorizedLoads.count(Slice.back()))
+          continue;
+        // Check if it is profitable to try vectorizing gathered loads. It is
+        // profitable if we have more than 3 consecutive loads or if we have
+        // less but all users are vectorized or deleted.
+        bool AllowToVectorize =
+            NumElts >= 3 ||
+            any_of(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
+              return TE->State == TreeEntry::NeedToGather &&
+                     TE->Scalars.size() == 2 &&
+                     (equal(TE->Scalars, Slice) ||
+                      equal(TE->Scalars, reverse(Slice)));
+            });
+        // Check if it is profitable to vectorize 2-elements loads.
+        if (NumElts == 2) {
+          bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
+              Slice.front()->getType(), ElementCount::getFixed(NumElts));
+          auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
+            for (LoadInst *LI : Slice) {
+              // If single use/user - allow to vectorize.
+              if (LI->hasOneUse())
+                continue;
+              // 1. Check if number of uses equal number of users.
+              // 2. All users are deleted.
+              // 3. The load broadcasts are not allowed or the load is not
+              // broadcasted.
+              if (std::distance(LI->user_begin(), LI->user_end()) !=
+                  LI->getNumUses())
+                return false;
+              for (User *U : LI->users()) {
+                if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
+                  continue;
+                if (const TreeEntry *UTE = getTreeEntry(U)) {
+                  if (!IsLegalBroadcastLoad)
+                    // The broadcast is illegal - vectorize loads.
+                    continue;
+                  for (int I = 0, End = UTE->getNumOperands(); I < End; ++I) {
+                    if (all_of(UTE->getOperand(I),
+                               [LI](Value *V) { return V == LI; }))
+                      // Found legal broadcast - do not vectorize.
+                      return false;
+                  }
+                }
+              }
+            }
+            return true;
+          };
+          AllowToVectorize = CheckIfAllowed(Slice);
+        }
+        if (AllowToVectorize) {
+          SmallVector<Value *> PointerOps;
+          OrdersType CurrentOrder;
+          // Try to build vector load.
+          ArrayRef<Value *> Values(
+              reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
+          LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
+                                            PointerOps);
+          if (LS != LoadsState::Gather) {
+            Results.emplace_back(Values, LS);
+            VectorizedLoads.insert(Slice.begin(), Slice.end());
+            // If we vectorized initial block, no need to try to vectorize it
+            // again.
+            if (Cnt == StartIdx)
+              StartIdx += NumElts;
+            Cnt += NumElts - 1;
+            continue;
+          }
+        }
+        // Check if the whole array was vectorized already - exit.
+        if (StartIdx >= Loads.size())
+          break;
+      }
+    }
+    for (LoadInst *LI : Loads) {
+      if (!VectorizedLoads.contains(LI))
+        NonVectorized.push_back(LI);
+    }
+    return Results;
+  };
+  auto ProcessGatheredLoads =
+      [&](ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
+        SmallVector<LoadInst *> NonVectorized;
+        for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
+          SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
+          SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
+          transform(
+              LoadsDists, OriginalLoads.begin(),
+              [](const std::pair<LoadInst *, int> &L) { return L.first; });
+          sort(LocalLoadsDists, LoadSorter);
+          SmallVector<LoadInst *> Loads(LocalLoadsDists.size());
+          transform(
+              LocalLoadsDists, Loads.begin(),
+              [](const std::pair<LoadInst *, int> &L) { return L.first; });
+          BoUpSLP::ValueSet VectorizedLoads;
+          SmallVector<LoadInst *> SortedNonVectorized;
+          SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
+              GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized);
+          if (all_of(Results,
+                     [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
+                       return P.second == LoadsState::ScatterVectorize;
+                     })) {
+            VectorizedLoads.clear();
+            SmallVector<LoadInst *> UnsortedNonVectorized;
+            SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
+                UnsortedResults = GetVectorizedRanges(
+                    OriginalLoads, VectorizedLoads, UnsortedNonVectorized);
+            if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
+              SortedNonVectorized.swap(UnsortedNonVectorized);
+              Results.swap(UnsortedResults);
+            }
+          }
+          for (auto [Slice, _] : Results) {
+            LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
+                              << Slice.size() << ")\n");
+
+            buildTree_rec(Slice, 0, EdgeInfo());
+          }
+          NonVectorized.append(SortedNonVectorized);
+        }
+        return NonVectorized;
+      };
+  SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(GatheredLoads);
+  SmallVector<SmallVector<std::pair<LoadInst *, int>>> FinalGatheredLoads;
+  for (LoadInst *LI : NonVectorized) {
+    // Reinsert non-vectorized loads to other list of loads with the same
+    // base pointers.
+    gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
+                                    FinalGatheredLoads,
+                                    /*AddNew=*/false);
+  }
+  // Final attempt to vectorize non-vectorized loads.
+  (void)ProcessGatheredLoads(FinalGatheredLoads);
+  // If no new entries created, consider it as no gathered loads entries must be
+  // handled.
+  if (static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
+      VectorizableTree.size())
+    GatheredLoadsEntriesFirst = NoGatheredLoads;
 }
 
 /// \return true if the specified list of values has only one instruction that
@@ -6333,6 +6805,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                           !isConstant(V);
                                                  })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+        if (UserTreeIdx.UserTE)
+          gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, *TTI,
+                                          GatheredLoads);
         if (DoNotFail && UniquePositions.size() > 1 &&
             NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
             all_of(UniqueValues, [=](Value *V) {
@@ -6387,6 +6862,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                       cast<Instruction>(S.MainOp)->getOpcode();
          })))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+    if (UserTreeIdx.UserTE)
+      gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, *TTI, GatheredLoads);
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                    ReuseShuffleIndicies);
@@ -6475,6 +6952,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     }
     return true;
   };
+  bool IsSplat = isSplat(VL);
   SmallVector<unsigned> SortedIndices;
   BasicBlock *BB = nullptr;
   bool IsScatterVectorizeUserTE =
@@ -6496,16 +6974,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
        BB &&
        sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                        SortedIndices));
-  if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
+  if (!AreAllSameInsts || IsSplat || allConstant(VL) ||
+      (S.getOpcode() == Instruction::Load && UserTreeIdx.UserTE &&
+       TTI->getShuffleCost(
+           TTI::SK_ExtractSubvector,
+           FixedVectorType::get(S.OpValue->getType(), VL.size() * 2),
+           std::nullopt, TTI::TCK_RecipThroughput, VL.size(),
+           FixedVectorType::get(S.OpValue->getType(), VL.size())) <=
+           TTI->getMemoryOpCost(
+               Instruction::Load,
+               FixedVectorType::get(S.OpValue->getType(), VL.size()),
+               cast<LoadInst>(S.OpValue)->getAlign(),
+               cast<LoadInst>(S.OpValue)->getPointerAddressSpace())) ||
       (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
            S.OpValue) &&
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
-    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
-    if (TryToFindDuplicates(S))
-      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                   ReuseShuffleIndicies);
-    return;
+    bool NeedToGather = true;
+    if (UserTreeIdx.UserTE)
+      NeedToGather &= gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, *TTI,
+                                                      GatheredLoads) ||
+                      S.getOpcode() != Instruction::Load;
+    if (NeedToGather) {
+      LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
+      if (IsSplat || TryToFindDuplicates(S))
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
+      return;
+    }
   }
 
   // We now know that this is a vector of instructions of the same type from
@@ -6514,7 +7010,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   // Check if this is a duplicate of another entry.
   if (TreeEntry *E = getTreeEntry(S.OpValue)) {
     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
-    if (!E->isSame(VL)) {
+    if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) {
       auto It = MultiNodeScalars.find(S.OpValue);
       if (It != MultiNodeScalars.end()) {
         auto *TEIt = find_if(It->getSecond(),
@@ -6619,6 +7115,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   TreeEntry::EntryState State = getScalarsVectorizationState(
       S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
   if (State == TreeEntry::NeedToGather) {
+    if (UserTreeIdx.UserTE)
+      gatherPossiblyVectorizableLoads(*this, VL, *DL, *SE, *TTI, GatheredLoads);
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndicies);
     return;
@@ -8852,7 +9350,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         InstructionCost VecCost = VectorCost(CommonCost);
         // Check if the current node must be resized, if the parent node is not
         // resized.
-        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
+        if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0 &&
+            (E->getOpcode() != Instruction::Load ||
+             !E->UserTreeIndices.empty())) {
           const EdgeInfo &EI = E->UserTreeIndices.front();
           if ((EI.UserTE->getOpcode() != Instruction::Select ||
                EI.EdgeIdx != 0) &&
@@ -9474,7 +9974,14 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
         VectorizableTree[0]->getVectorFactor() > 2)))
     return true;
 
-  if (VectorizableTree.size() != 2)
+  if (VectorizableTree.size() != 2 &&
+      GatheredLoadsEntriesFirst == NoGatheredLoads)
+    return false;
+  if (VectorizableTree.size() != 3 && GatheredLoadsEntriesFirst >= 2 &&
+      all_of(ArrayRef(VectorizableTree).drop_front(),
+             [](const std::unique_ptr<TreeEntry> &TE) {
+               return TE->Vectorize == TreeEntry::NeedToGather;
+             }))
     return false;
 
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
@@ -9561,8 +10068,19 @@ bool BoUpSLP::isLoadCombineCandidate() const {
 }
 
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
+  bool AreAllLoadsGathers = [this]() {
+    return GatheredLoadsEntriesFirst >= 0 &&
+           all_of(make_range(std::next(VectorizableTree.begin(),
+                                       GatheredLoadsEntriesFirst),
+                             VectorizableTree.end()),
+                  [](const std::unique_ptr<TreeEntry> &TE) {
+                    return TE->State == TreeEntry::NeedToGather;
+                  });
+  }();
   // No need to vectorize inserts of gathered values.
-  if (VectorizableTree.size() == 2 &&
+  if (((VectorizableTree.size() == 2 &&
+        GatheredLoadsEntriesFirst == NoGatheredLoads) ||
+       (GatheredLoadsEntriesFirst == 2 && AreAllLoadsGathers)) &&
       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
       VectorizableTree[1]->State == TreeEntry::NeedToGather &&
       (VectorizableTree[1]->getVectorFactor() <= 2 ||
@@ -9587,7 +10105,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
 
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
-  if (VectorizableTree.size() >= MinTreeSize)
+  if (VectorizableTree.size() >= MinTreeSize &&
+      (GatheredLoadsEntriesFirst == NoGatheredLoads ||
+       (GatheredLoadsEntriesFirst >= static_cast<int>(MinTreeSize) &&
+        !AreAllLoadsGathers)))
     return false;
 
   // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
@@ -9917,6 +10438,21 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       }
     }
 
+    // Exclude cost of gather loads nodes which are not used. These nodes were
+    // built as part of the final attempt to vectorize gathered loads.
+    if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+        I >= static_cast<unsigned>(GatheredLoadsEntriesFirst) &&
+        TE.State == TreeEntry::NeedToGather && TE.UserTreeIndices.empty()) {
+      assert(all_of(TE.Scalars,
+                    [this](Value *V) {
+                      return (isa<LoadInst>(V) && MustGather.contains(V)) ||
+                             isa<Constant>(V) ||
+                             V->getType()->isPtrOrPtrVectorTy();
+                    }) &&
+             "Expected loads, pointers or constants only.");
+      continue;
+    }
+
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
     Cost += C;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
@@ -10495,7 +11031,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       VToTEs.insert(TEPtr);
     }
     if (const TreeEntry *VTE = getTreeEntry(V)) {
-      if (ForOrder) {
+      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) {
         if (VTE->State != TreeEntry::Vectorize) {
           auto It = MultiNodeScalars.find(V);
           if (It == MultiNodeScalars.end())
@@ -10976,6 +11512,14 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
     return FirstInst;
   };
 
+  // Set insertpoint for gathered loads to the very first load.
+  if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+      E->Idx >= GatheredLoadsEntriesFirst &&
+      E->State != TreeEntry::NeedToGather &&
+      E->getOpcode() == Instruction::Load) {
+    Res.second = FindFirstInst();
+    return *Res.second;
+  }
   // Set the insert point to the beginning of the basic block if the entry
   // should not be scheduled.
   if (doesNotNeedToSchedule(E->Scalars) ||
@@ -13086,6 +13630,20 @@ Value *BoUpSLP::vectorizeTree(
   else
     Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
 
+  // Emit gathered loads first to emit better code for the users of those
+  // gathered loads.
+  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
+               TE->Idx >= GatheredLoadsEntriesFirst &&
+               (TE->State != TreeEntry::NeedToGather ||
+                !TE->UserTreeIndices.empty())) {
+      assert((!TE->UserTreeIndices.empty() ||
+              (TE->getOpcode() == Instruction::Load &&
+               TE->State != TreeEntry::NeedToGather)) &&
+             "Expected gathered load node.");
+      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
+    }
+  }
   // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
   (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index f98d0ad0527c29..ce96d4bbb22029 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -340,12 +340,12 @@ entry:
 define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) {
 ; CHECK-LABEL: @reduce_blockstrided4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
@@ -429,20 +429,16 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]])
-; CHECK-NEXT:    ret i32 [[TMP21]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i8> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
+; CHECK-NEXT:    ret i32 [[TMP17]]
 ;
 entry:
   %idx.ext = sext i32 %off1 to i64
@@ -677,63 +673,60 @@ entry:
 define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) {
 ; CHECK-LABEL: @store_blockstrided3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
-; CHECK-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
+; CHECK-NEXT:    [[ADD18:%.*]] = add nsw i32 [[MUL]], 2
+; CHECK-NEXT:    [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
 ; CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2
-; CHECK-NEXT:    [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64
-; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
+; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
+; CHECK-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
+; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4
-; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]]
+; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40
-; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4
+; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28
+; CHECK-NEXT:    [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], ptr [[Z]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
-; CHECK-NEXT:    store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4
-; CHECK-NEXT:    [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]]
-; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36
-; CHECK-NEXT:    store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4
+; CHECK-NEXT:    store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4
+; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[TMP18:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP14]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -833,12 +826,12 @@ entry:
 define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) {
 ; CHECK-LABEL: @store_blockstrided4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
@@ -1203,86 +1196,78 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[P2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32>
-; CHECK-NEXT:    [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32>
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
-; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]]
-; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT:    [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <16 x i8> [[TMP22]] to <16 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw <16 x i32> [[TMP19]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <16 x i8> [[TMP27]] to <16 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = sub nsw <16 x i32> [[TMP28]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = shl nsw <16 x i32> [[TMP33]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP35:%.*]] = add nsw <16 x i32> [[TMP34]], [[TMP24]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i32> [[TMP35]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+; CHECK-NEXT:    [[TMP37:%.*]] = add nsw <16 x i32> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP38:%.*]] = sub nsw <16 x i32> [[TMP35]], [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i32> [[TMP37]], <16 x i32> [[TMP38]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30, i32 5, i32 1, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <16 x i32> [[TMP39]], <16 x i32> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP41:%.*]] = add nsw <16 x i32> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP42:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <16 x i32> [[TMP41]], <16 x i32> [[TMP42]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30, i32 5, i32 1, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28>
-; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
 ; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-; CHECK-NEXT:    [[TMP53:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]]
-; CHECK-NEXT:    [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]]
-; CHECK-NEXT:    [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
-; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
-; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT:    [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP61:%.*]] = and <16 x i32> [[TMP60]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP63:%.*]] = add <16 x i32> [[TMP62]], [[TMP59]]
-; CHECK-NEXT:    [[TMP64:%.*]] = xor <16 x i32> [[TMP63]], [[TMP62]]
-; CHECK-NEXT:    [[TMP65:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP64]])
-; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP65]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP65]], 16
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP52:%.*]] = lshr <16 x i32> [[TMP51]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:    [[TMP53:%.*]] = and <16 x i32> [[TMP52]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP54:%.*]] = mul nuw <16 x i32> [[TMP53]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP55:%.*]] = add <16 x i32> [[TMP54]], [[TMP51]]
+; CHECK-NEXT:    [[TMP56:%.*]] = xor <16 x i32> [[TMP55]], [[TMP54]]
+; CHECK-NEXT:    [[TMP57:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP56]])
+; CHECK-NEXT:    [[CONV118:%.*]] = and i32 [[TMP57]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP57]], 16
 ; CHECK-NEXT:    [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
 ; CHECK-NEXT:    [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
 ; CHECK-NEXT:    ret i32 [[SHR120]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index f85f658fed4d5c..14af5e529bb981 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,168 +14,164 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP23]], i32 11
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]]
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]]
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]]
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]]
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]]
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]]
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]]
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]]
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]]
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]]
-; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]]
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]]
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]]
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]]
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]]
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]]
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]]
-; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]]
-; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]]
-; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]]
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]]
-; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]]
-; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]]
-; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]]
-; CHECK-NEXT:    [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP8]], <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i16> [[TMP13]], <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP14]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i16> [[TMP16]], <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP17]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <64 x i16> [[TMP18]] to <64 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <64 x i32> [[TMP19]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <64 x i32> [[TMP19]], i32 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <64 x i32> [[TMP19]], [[TMP19]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <64 x i32> [[TMP19]], i32 2
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP19]], i32 3
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP19]], i32 4
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP25]]
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <64 x i32> [[TMP19]], i32 5
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP19]], i32 6
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP27]]
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP19]], i32 7
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP19]], i32 8
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP19]], i32 9
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP19]], i32 10
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP19]], i32 11
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP32]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP19]], i32 12
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP33]]
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP19]], i32 13
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP34]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP19]], i32 14
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP35]]
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP19]], i32 15
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP36]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP19]], i32 16
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP37]]
+; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP19]], i32 17
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP38]]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP19]], i32 18
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP39]]
+; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP19]], i32 19
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP40]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP19]], i32 20
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP19]], i32 21
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP42]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP19]], i32 22
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP19]], i32 23
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP19]], i32 24
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP45]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP19]], i32 25
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP19]], i32 26
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP47]]
+; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP19]], i32 27
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP48]]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP19]], i32 28
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP49]]
+; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP19]], i32 29
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP50]]
+; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP19]], i32 30
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP51]]
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP19]], i32 31
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP52]]
+; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP19]], i32 32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP19]], i32 33
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP54]]
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP19]], i32 34
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP55]]
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP19]], i32 35
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP56]]
+; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP19]], i32 36
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP57]]
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP19]], i32 37
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP58]]
+; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP19]], i32 38
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP19]], i32 39
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP60]]
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP19]], i32 40
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP61]]
+; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP19]], i32 41
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP62]]
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP19]], i32 42
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP63]]
+; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP19]], i32 43
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP64]]
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP19]], i32 44
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP65]]
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP19]], i32 45
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP66]]
+; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP19]], i32 46
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP67]]
+; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP19]], i32 47
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP68]]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP19]], i32 48
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP69]]
+; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP19]], i32 49
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP70]]
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP19]], i32 50
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP71]]
+; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP19]], i32 51
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP72]]
+; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP19]], i32 52
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP73]]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP19]], i32 53
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP74]]
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP19]], i32 54
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP75]]
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP19]], i32 55
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP76]]
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP19]], i32 56
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP77]]
+; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP19]], i32 57
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP78]]
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP19]], i32 58
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP79]]
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP19]], i32 59
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP80]]
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP19]], i32 60
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP81]]
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <64 x i32> [[TMP19]], i32 61
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP82]]
+; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <64 x i32> [[TMP19]], i32 62
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP83]]
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <64 x i32> [[TMP19]], i32 63
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP84]]
+; CHECK-NEXT:    [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP22]])
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP89]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP85]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index 95aa40f664c0ce..d08020f9c58469 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,20 +17,14 @@
 
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3
-; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    store <4 x float> [[TMP10]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 4
+; CHECK-NEXT:    [[LD4:%.*]] = load float, ptr [[GEP4]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[LD4]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[A]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep1 = getelementptr inbounds float, ptr %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
index 67746f2cbf5d22..d4dbb8bbfaf0d4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll
@@ -1,8 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POWER-OF-2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck %s
 
 define void @vec3_vectorize_call(ptr %Colour, float %0) {
+; NON-POWER-OF-2-LABEL: @vec3_vectorize_call(
+; NON-POWER-OF-2-NEXT:  entry:
+; NON-POWER-OF-2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
+; NON-POWER-OF-2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
+; NON-POWER-OF-2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; NON-POWER-OF-2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
+; NON-POWER-OF-2-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POWER-OF-2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
+; NON-POWER-OF-2-NEXT:    ret void
+;
 ; CHECK-LABEL: @vec3_vectorize_call(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
@@ -28,6 +38,19 @@ entry:
 }
 
 define void @vec3_fmuladd_64(ptr %Colour, double %0) {
+; NON-POWER-OF-2-LABEL: @vec3_fmuladd_64(
+; NON-POWER-OF-2-NEXT:  entry:
+; NON-POWER-OF-2-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
+; NON-POWER-OF-2-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; NON-POWER-OF-2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; NON-POWER-OF-2-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; NON-POWER-OF-2-NEXT:    [[TMP4:%.*]] = fptrunc <2 x double> [[TMP3]] to <2 x float>
+; NON-POWER-OF-2-NEXT:    store <2 x float> [[TMP4]], ptr [[COLOUR]], align 4
+; NON-POWER-OF-2-NEXT:    [[TMP5:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 0.000000e+00, double 0.000000e+00)
+; NON-POWER-OF-2-NEXT:    [[CONV82:%.*]] = fptrunc double [[TMP5]] to float
+; NON-POWER-OF-2-NEXT:    store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
+; NON-POWER-OF-2-NEXT:    ret void
+;
 ; CHECK-LABEL: @vec3_fmuladd_64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index b59659ca75eb24..32bc43278bf8e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -241,32 +241,31 @@ entry:
 define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_16xi8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8
 ; CHECK-NEXT:    [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1
 ; CHECK-NEXT:    [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
+; CHECK-NEXT:    [[GEP_13:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 13
+; CHECK-NEXT:    [[GEP_15:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 15
+; CHECK-NEXT:    [[L_15:%.*]] = load i8, ptr [[GEP_15]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
 ; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
-; CHECK-NEXT:    [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10
-; CHECK-NEXT:    [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1
-; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11
-; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
-; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]]
-; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[PTR]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_9]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i8>, ptr [[GEP_13]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 9, i32 10, i32 11, i32 12, i32 16, i32 17, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[L_15]], i32 15
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt <16 x i8> [[TMP11]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = select <16 x i1> [[TMP12]], <16 x i8> [[TMP11]], <16 x i8> [[TMP14]]
+; CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index d87bdfe2689916..6edcbe9d534788 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -7,159 +7,180 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
 ; CHECK-NEXT:    [[CONV1:%.*]] = zext i8 [[TMP0]] to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 4, i64 6>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
-; CHECK-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; CHECK-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
 ; CHECK-NEXT:    [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
-; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
+; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <2 x i8> [[TMP5]] to <2 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <2 x i32> <i32 4, i32 6>
+; CHECK-NEXT:    [[TMP12:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <2 x i32> <i32 4, i32 6>
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP13]] to <2 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = sub <2 x i32> [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shl <2 x i32> [[TMP15]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP17:%.*]] = add <2 x i32> [[TMP42]], [[TMP10]]
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP26:%.*]] = add <2 x i32> [[TMP25]], [[TMP19]]
-; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6
-; CHECK-NEXT:    [[TMP27:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = sub <2 x i32> [[TMP19]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <2 x i32> <i32 5, i32 7>
+; CHECK-NEXT:    [[TMP24:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> poison, <2 x i32> <i32 5, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT:    [[TMP27:%.*]] = sub <2 x i32> [[TMP24]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = shl <2 x i32> [[TMP27]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP29:%.*]] = add <2 x i32> [[TMP28]], [[TMP22]]
+; CHECK-NEXT:    [[TMP30:%.*]] = sub <2 x i32> [[TMP17]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> getelementptr (i8, <4 x ptr> zeroinitializer, <4 x i64> <i64 4, i64 5, i64 0, i64 1>), i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison)
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT:    [[TMP34:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32>
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
+; CHECK-NEXT:    [[TMP43:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]]
+; CHECK-NEXT:    [[TMP44:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP38:%.*]] = zext <2 x i8> [[TMP44]] to <2 x i32>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <2 x i32> [[TMP38]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP31]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP41:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1
-; CHECK-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]]
-; CHECK-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT:    [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
-; CHECK-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
-; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP37]], [[TMP43]]
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[TMP45:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX20_3]], i32 0
 ; CHECK-NEXT:    [[TMP46:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP45]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32>
-; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
-; CHECK-NEXT:    [[TMP49:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP48]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]]
-; CHECK-NEXT:    [[TMP52:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
-; CHECK-NEXT:    [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32>
-; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 6, i64 4>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
-; CHECK-NEXT:    [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]]
-; CHECK-NEXT:    [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP51]]
-; CHECK-NEXT:    [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32>
-; CHECK-NEXT:    [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 3, i64 1>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32>
-; CHECK-NEXT:    [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]]
-; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 7, i64 5>), i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
-; CHECK-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP52:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i16>
+; CHECK-NEXT:    [[TMP48:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_3]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i16>
+; CHECK-NEXT:    [[TMP50:%.*]] = sub <2 x i16> [[TMP52]], [[TMP49]]
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <2 x i16> [[TMP50]], i32 1
+; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP55]] to i32
+; CHECK-NEXT:    [[SHL30_3:%.*]] = shl i32 [[TMP51]], 16
+; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <2 x i16> [[TMP50]], i32 0
+; CHECK-NEXT:    [[TMP66:%.*]] = sext i16 [[TMP56]] to i32
+; CHECK-NEXT:    [[ADD31_3:%.*]] = add i32 [[SHL30_3]], [[TMP66]]
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr i8, ptr null, i64 3
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr i8, ptr null, i64 3
+; CHECK-NEXT:    [[TMP53:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX32_3]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP53]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i16>
+; CHECK-NEXT:    [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_3]], i64 4, <2 x i1> <i1 true, i1 true>, i32 2)
+; CHECK-NEXT:    [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i16>
+; CHECK-NEXT:    [[TMP60:%.*]] = sub <2 x i16> [[TMP57]], [[TMP59]]
+; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <2 x i16> [[TMP60]], i32 1
+; CHECK-NEXT:    [[TMP62:%.*]] = sext i16 [[TMP61]] to i32
+; CHECK-NEXT:    [[SHL42_3:%.*]] = shl i32 [[TMP62]], 16
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <2 x i16> [[TMP60]], i32 0
+; CHECK-NEXT:    [[TMP64:%.*]] = sext i16 [[TMP63]] to i32
+; CHECK-NEXT:    [[ADD43_3:%.*]] = add i32 [[SHL42_3]], [[TMP64]]
+; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
 ; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
+; CHECK-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP65]], [[TMP75]]
+; CHECK-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD31_3]], [[ADD43_3]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP29]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP17]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP47:%.*]] = add <2 x i32> [[TMP67]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <2 x i32> [[TMP29]], i32 [[ADD43_3]], i32 0
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[ADD31_3]], i32 0
+; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP71]]
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
+; CHECK-NEXT:    [[CONV:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV]], [[TMP79]]
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
+; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP76]], [[TMP86]]
+; CHECK-NEXT:    [[TMP77:%.*]] = sub <2 x i32> [[TMP47]], [[TMP72]]
+; CHECK-NEXT:    [[TMP91:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[SUB47_3]], i32 0
+; CHECK-NEXT:    [[TMP92:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP92]], i32 [[SUB45_3]], i32 0
+; CHECK-NEXT:    [[TMP81:%.*]] = add <2 x i32> [[TMP91]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = sub <2 x i32> [[TMP80]], [[TMP91]]
 ; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
+; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0
+; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP83]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <2 x i32> [[TMP81]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <2 x i32> [[TMP81]], i32 1
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP84]], [[TMP85]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[TMP85]], [[TMP84]]
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
 ; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
 ; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0
+; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP101]], [[TMP111]]
 ; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
 ; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
 ; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP82]], i32 0
+; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP82]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP89]], [[TMP90]]
+; CHECK-NEXT:    [[TMP228:%.*]] = sub i32 [[TMP90]], [[TMP89]]
 ; CHECK-NEXT:    [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
 ; CHECK-NEXT:    [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
 ; CHECK-NEXT:    [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
 ; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
-; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
-; CHECK-NEXT:    [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
-; CHECK-NEXT:    [[TMP83:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP3]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32>
-; CHECK-NEXT:    [[TMP85:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP86:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP84]], [[TMP86]]
+; CHECK-NEXT:    [[TMP93:%.*]] = load <8 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP94:%.*]] = shufflevector <8 x i8> [[TMP93]], <8 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; CHECK-NEXT:    [[TMP114:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; CHECK-NEXT:    [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP114]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
+; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <8 x i8> [[TMP93]], <8 x i8> poison, <2 x i32> <i32 4, i32 6>
+; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
+; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP98]], [[TMP117]]
 ; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP89:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32>
-; CHECK-NEXT:    [[TMP91:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP92:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32>
-; CHECK-NEXT:    [[TMP93:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP9]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP92]], [[TMP94]]
+; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <8 x i8> [[TMP93]], <8 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP118:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP119:%.*]] = shufflevector <4 x i8> [[TMP114]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32>
+; CHECK-NEXT:    [[TMP120:%.*]] = shufflevector <8 x i8> [[TMP93]], <8 x i8> poison, <2 x i32> <i32 5, i32 7>
+; CHECK-NEXT:    [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP106]], [[TMP121]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
-; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
+; CHECK-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP97]], [[TMP118]]
+; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP112]]
 ; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP100]], [[TMP113]]
+; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP115]]
 ; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
 ; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
 ; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
 ; CHECK-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
 ; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
+; CHECK-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP238]], [[TMP143]]
+; CHECK-NEXT:    [[TMP122:%.*]] = extractelement <2 x i32> [[TMP105]], i32 0
 ; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
+; CHECK-NEXT:    [[TMP173:%.*]] = add i32 [[TMP108]], [[TMP122]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP122]], [[TMP108]]
 ; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
 ; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
@@ -168,49 +189,46 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
 ; CHECK-NEXT:    [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
 ; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <2 x i8> [[TMP111]], i8 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0
-; CHECK-NEXT:    [[TMP115:%.*]] = shufflevector <2 x ptr> [[TMP114]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP116:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP116]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32>
-; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0
-; CHECK-NEXT:    [[TMP120:%.*]] = shufflevector <2 x ptr> [[TMP119]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP121:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 4, i64 6>
-; CHECK-NEXT:    [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP121]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP123]]
-; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
+; CHECK-NEXT:    [[TMP126:%.*]] = load <8 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <8 x i8> [[TMP126]], <8 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP129:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP130:%.*]] = shufflevector <4 x i8> [[TMP129]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
-; CHECK-NEXT:    [[TMP132:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 5, i64 7>
-; CHECK-NEXT:    [[TMP133:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP132]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
-; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
+; CHECK-NEXT:    [[TMP132:%.*]] = shufflevector <8 x i8> [[TMP126]], <8 x i8> poison, <2 x i32> <i32 4, i32 6>
+; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP132]] to <2 x i32>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP131]], [[TMP133]]
+; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
+; CHECK-NEXT:    [[TMP142:%.*]] = shufflevector <8 x i8> [[TMP126]], <8 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32>
+; CHECK-NEXT:    [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP129]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32>
+; CHECK-NEXT:    [[TMP147:%.*]] = shufflevector <8 x i8> [[TMP126]], <8 x i8> poison, <2 x i32> <i32 5, i32 7>
+; CHECK-NEXT:    [[TMP141:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP139]], [[TMP141]]
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
-; CHECK-NEXT:    [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
+; CHECK-NEXT:    [[TMP145:%.*]] = sub <2 x i32> [[TMP137]], [[TMP144]]
+; CHECK-NEXT:    [[TMP150:%.*]] = add <2 x i32> [[TMP136]], [[TMP145]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
-; CHECK-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
-; CHECK-NEXT:    [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
+; CHECK-NEXT:    [[TMP148:%.*]] = sub <2 x i32> [[TMP140]], [[TMP134]]
+; CHECK-NEXT:    [[TMP149:%.*]] = add <2 x i32> [[TMP125]], [[TMP148]]
+; CHECK-NEXT:    [[TMP257:%.*]] = add <2 x i32> [[TMP150]], [[TMP149]]
+; CHECK-NEXT:    [[TMP151:%.*]] = sub <2 x i32> [[TMP149]], [[TMP150]]
+; CHECK-NEXT:    [[TMP156:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
 ; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP156]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP156]], [[TMP146]]
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
+; CHECK-NEXT:    [[TMP155:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
+; CHECK-NEXT:    [[TMP174:%.*]] = add i32 [[TMP155]], [[TMP157]]
+; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP157]], [[TMP155]]
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[TMP147:%.*]] = lshr <2 x i32> [[TMP110]], <i32 15, i32 15>
-; CHECK-NEXT:    [[TMP148:%.*]] = and <2 x i32> [[TMP147]], <i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], <i32 65535, i32 65535>
+; CHECK-NEXT:    [[TMP152:%.*]] = lshr <2 x i32> [[TMP110]], <i32 15, i32 15>
+; CHECK-NEXT:    [[TMP153:%.*]] = and <2 x i32> [[TMP152]], <i32 65537, i32 65537>
+; CHECK-NEXT:    [[TMP154:%.*]] = mul <2 x i32> [[TMP153]], <i32 65535, i32 65535>
 ; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
 ; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
@@ -218,7 +236,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
 ; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
 ; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP83]]
 ; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
 ; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
@@ -228,29 +246,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT:    [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
-; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
-; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]]
-; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
-; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
 ; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]]
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[SUB106_4:%.*]] = sub i32 [[SUB106_1]], [[SUB102_1]]
 ; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
 ; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
-; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> [[TMP151]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
@@ -263,7 +265,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
 ; CHECK-NEXT:    [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_4]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]]
 ; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
 ; CHECK-NEXT:    [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0
@@ -271,22 +273,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
-; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
-; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
-; CHECK-NEXT:    [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]]
-; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1
-; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
-; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
-; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]]
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]]
+; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP77]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP175:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[SUB51]], i32 1
+; CHECK-NEXT:    [[TMP176:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[SUB51_1]], i32 1
+; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP175]], [[TMP176]]
 ; CHECK-NEXT:    [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
 ; CHECK-NEXT:    [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
@@ -300,7 +291,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
 ; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]]
+; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP154]], [[TMP248]]
 ; CHECK-NEXT:    [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
 ; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
@@ -308,26 +299,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
 ; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP267]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP206]]
-; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
+; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP267]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP189]]
+; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
+; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP190]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
-; CHECK-NEXT:    [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
-; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]]
-; CHECK-NEXT:    [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0
-; CHECK-NEXT:    [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]]
-; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
-; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]]
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[SUB59_1]], [[SUB60]]
+; CHECK-NEXT:    [[TMP229:%.*]] = sub i32 [[SUB60]], [[SUB59_1]]
 ; CHECK-NEXT:    [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
 ; CHECK-NEXT:    [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -335,8 +313,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]]
 ; CHECK-NEXT:    [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]]
 ; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0
-; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
 ; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
 ; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]]
@@ -352,8 +328,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
-; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]]
+; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP204]]
 ; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1
 ; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 1e7eb4a4167242..b59b78ca43a2e6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -955,22 +955,18 @@ declare i32 @llvm.abs.i32(i32, i1)
 
 define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) {
 ; CHECK-LABEL: @stride_sum_abs_diff(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
-; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
+; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true)
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT:    ret i32 [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 true)
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %x.0 = load i32, ptr %p
   %y.0 = load i32, ptr %q
@@ -1010,12 +1006,10 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT:    ret i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
   %x.0 = load i8, ptr %p, align 1
@@ -1059,12 +1053,10 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT:    ret i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
   entry:
   %0 = load i8, ptr %x, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
index ff06bdc0e84467..a44b73e42bf2bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -12,13 +12,12 @@ define void @foo() local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 0), align 4
 ; CHECK-NEXT:    [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], <i32 6, i32 6, i32 6, i32 6>
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> undef, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
index 66bfe056831570..01e028cac0c445 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -3,9 +3,9 @@
 
 define void @i64_simplified(ptr noalias %st, ptr noalias %ld) {
 ; CHECK-LABEL: @i64_simplified(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[LD:%.*]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], ptr [[ST:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[LD:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[ST:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, ptr %ld, i64 1
@@ -26,9 +26,9 @@ define void @i64_simplified(ptr noalias %st, ptr noalias %ld) {
 
 define void @i64_simplifiedi_reversed(ptr noalias %st, ptr noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_reversed(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[LD:%.*]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], ptr [[ST:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[LD:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[ST:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, ptr %ld, i64 1
@@ -49,17 +49,11 @@ define void @i64_simplifiedi_reversed(ptr noalias %st, ptr noalias %ld) {
 
 define void @i64_simplifiedi_extract(ptr noalias %st, ptr noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_extract(
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[T0:%.*]] = load i64, ptr [[LD]], align 8
-; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[ST:%.*]], i64 1
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 3
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ST]], align 8
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    store i64 [[T0]], ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    store i64 [[T1]], ptr [[ARRAYIDX5]], align 8
-; CHECK-NEXT:    store i64 [[T1]], ptr [[LD]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[LD:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr [[ST:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    store i64 [[TMP3]], ptr [[LD]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, ptr %ld, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
index 05511f843a68fa..332758a7b8b340 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -6,21 +6,22 @@
 define { i64, i64 } @patatino(double %arg) {
 ; CHECK-LABEL: @patatino(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr @global, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 2), align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr @global, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fptosi <2 x double> [[TMP8]] to <2 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP13]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[T17]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 94534274cab2ff..b119beff7500ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -301,14 +301,14 @@ define void @reorder_alt_subTree() #0 {
 
 define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %d) {
 ; CHECK-LABEL: @reorder_alt_rightsubTree(
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[C:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %1 = load double, ptr %a
@@ -332,24 +332,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
 
 define void @vec_shuff_reorder() #0 {
 ; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @fb, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @fa, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    store <4 x float> [[TMP17]], ptr @fc, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP7]], ptr @fc, align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = load float, ptr @fb, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 889f5a95c81d69..524d97b32d47d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -3,23 +3,23 @@
 
 define void @test(ptr %0, ptr %1, ptr %2) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i32> <i32 0, i32 0, i32 undef, i32 0>, [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[TMP10]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[TMP2:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %4 = load i32, ptr %1, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
index 6077c376251ef3..126d986e7ec3f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
@@ -3,16 +3,16 @@
 
 define void @test() {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr undef, i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr undef, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0
 ; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr null, align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 3, i32 poison>
 ; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -36,16 +36,18 @@ define void @test() {
 
 define void @test1() {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr undef, i32 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr undef, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr null, align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr null, align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP1]], <2 x i32> <i32 0, i32 5>
+; CHECK-NEXT:    store <2 x float> [[TMP9]], ptr null, align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = getelementptr inbounds float, ptr undef, i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 4cc3c1241b56df..560ddc80849482 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -837,21 +837,18 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
 ; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
 ; THRESH-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
-; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
-; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1
-; THRESH-NEXT:    [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]]
-; THRESH-NEXT:    [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4
-; THRESH-NEXT:    store i32 [[TMP19]], ptr @var, align 8
+; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
+; THRESH-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0
+; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]]
+; THRESH-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]
+; THRESH-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
+; THRESH-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
+; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]]
+; THRESH-NEXT:    [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4
+; THRESH-NEXT:    store i32 [[TMP16]], ptr @var, align 8
 ; THRESH-NEXT:    ret i32 [[OP_RDX5]]
 ;
   %2 = load i32, ptr @arr, align 16
@@ -1021,19 +1018,16 @@ define i32 @maxi8_wrong_parent(i32) {
 ; THRESH:       pp:
 ; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
 ; THRESH-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
-; THRESH-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
-; THRESH-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
-; THRESH-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
-; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1
-; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1
-; THRESH-NEXT:    [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT:    [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT:    [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]]
+; THRESH-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]])
+; THRESH-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
+; THRESH-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0
+; THRESH-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1
+; THRESH-NEXT:    [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]]
+; THRESH-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]
+; THRESH-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
+; THRESH-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
+; THRESH-NEXT:    [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; THRESH-NEXT:    [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]]
 ; THRESH-NEXT:    ret i32 [[OP_RDX5]]
 ;
   %2 = load i32, ptr @arr, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 9a41c1dc5de225..ddd5f6c61ad2aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s
 
 ;unsigned load_le32(unsigned char *data) {
 ;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
@@ -51,24 +51,14 @@ entry:
 }
 
 define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) {
-; AVX2-LABEL: @PR16739_byref(
-; AVX2-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
-; AVX2-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
-; AVX2-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
-; AVX2-NEXT:    ret <4 x float> [[I3]]
-;
-; AVX512-LABEL: @PR16739_byref(
-; AVX512-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1
-; AVX512-NEXT:    [[X0:%.*]] = load float, ptr [[X]], align 4
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; AVX512-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX512-NEXT:    [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[TMP3]]
+; CHECK-LABEL: @PR16739_byref(
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1
   %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
@@ -100,21 +90,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
 
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byval(
-; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2
-; CHECK-NEXT:    [[T4:%.*]] = load i64, ptr [[T2]], align 8
-; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
-; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
-; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0
-; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
-; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[T15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[T8]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[TMP6]]
 ;
   %t1 = load i64, ptr %x, align 16
   %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index bc8e6626e55084..1a0d6e4f14d280 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s
 
 ;unsigned load_le32(unsigned char *data) {
 ;    unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
@@ -51,24 +51,14 @@ entry:
 }
 
 define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) {
-; AVX2-LABEL: @PR16739_byref(
-; AVX2-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
-; AVX2-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
-; AVX2-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
-; AVX2-NEXT:    ret <4 x float> [[I3]]
-;
-; AVX512-LABEL: @PR16739_byref(
-; AVX512-NEXT:    [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1
-; AVX512-NEXT:    [[X0:%.*]] = load float, ptr [[X]], align 4
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
-; AVX512-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX512-NEXT:    [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; AVX512-NEXT:    ret <4 x float> [[TMP3]]
+; CHECK-LABEL: @PR16739_byref(
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = load float, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
 ;
   %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1
   %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
@@ -100,21 +90,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
 
 define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byval(
-; CHECK-NEXT:    [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2
-; CHECK-NEXT:    [[T4:%.*]] = load i64, ptr [[T2]], align 8
-; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
-; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
-; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
-; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
-; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
-; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
-; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
-; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2
-; CHECK-NEXT:    [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[T15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[T8]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <4 x i64> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <4 x float>
+; CHECK-NEXT:    ret <4 x float> [[TMP6]]
 ;
   %t1 = load i64, ptr %x, align 16
   %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
index 8d37665224e7ee..9c00f45b913b5b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll
@@ -7,24 +7,22 @@
 
 define <2 x i64> @load_00123456(ptr nocapture noundef readonly %data) {
 ; SSE-LABEL: @load_00123456(
-; SSE-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 1
-; SSE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 2
+; SSE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 2
 ; SSE-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3
 ; SSE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 4
 ; SSE-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 5
 ; SSE-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 6
-; SSE-NEXT:    [[T0:%.*]] = load i16, ptr [[DATA]], align 2
-; SSE-NEXT:    [[T1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[DATA]], align 2
 ; SSE-NEXT:    [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
 ; SSE-NEXT:    [[T3:%.*]] = load i16, ptr [[ARRAYIDX3]], align 2
 ; SSE-NEXT:    [[T4:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
 ; SSE-NEXT:    [[T5:%.*]] = load i16, ptr [[ARRAYIDX5]], align 2
 ; SSE-NEXT:    [[T6:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
-; SSE-NEXT:    [[VECINIT0_I_I:%.*]] = insertelement <8 x i16> undef, i16 [[T0]], i64 0
-; SSE-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT0_I_I]], i16 [[T0]], i64 1
-; SSE-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[T1]], i64 2
-; SSE-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[T2]], i64 3
-; SSE-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I_I]], i16 [[T3]], i64 4
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[T2]], i32 3
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[T3]], i64 4
 ; SSE-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I_I]], i16 [[T4]], i64 5
 ; SSE-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I_I]], i16 [[T5]], i64 6
 ; SSE-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I_I]], i16 [[T6]], i64 7
@@ -32,17 +30,18 @@ define <2 x i64> @load_00123456(ptr nocapture noundef readonly %data) {
 ; SSE-NEXT:    ret <2 x i64> [[T7]]
 ;
 ; AVX-LABEL: @load_00123456(
-; AVX-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 2
-; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[DATA]], align 2
-; AVX-NEXT:    [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINIT2_I_I2:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I2]], i16 [[T2]], i64 3
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[VECINIT7_I_I1:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX-NEXT:    [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I1]] to <2 x i64>
+; AVX-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 4
+; AVX-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 6
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[DATA]], align 2
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr [[ARRAYIDX4]], align 2
+; AVX-NEXT:    [[T6:%.*]] = load i16, ptr [[ARRAYIDX6]], align 2
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison>
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[T6]], i32 7
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[T7:%.*]] = bitcast <8 x i16> [[TMP8]] to <2 x i64>
 ; AVX-NEXT:    ret <2 x i64> [[T7]]
 ;
   %arrayidx1 = getelementptr inbounds i16, ptr %data, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index 2a9e40156420a9..21d2aed2407792 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -20,17 +20,15 @@
 define void @lookahead_basic(ptr %array) {
 ; CHECK-LABEL: @lookahead_basic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
-; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[ARRAY:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[ARRAY]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -80,17 +78,17 @@ entry:
 define void @lookahead_alt1(ptr %array) {
 ; CHECK-LABEL: @lookahead_alt1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 4
 ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
 ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
 ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[ARRAY]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ARRAY]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -136,21 +134,19 @@ entry:
 define void @lookahead_alt2(ptr %array) {
 ; CHECK-LABEL: @lookahead_alt2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
-; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[ARRAY:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr [[ARRAY]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -203,21 +199,18 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
-; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
-; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
 ; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
-; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
@@ -281,21 +274,18 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
-; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
-; CHECK-NEXT:    [[B0:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
 ; CHECK-NEXT:    [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
-; CHECK-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
@@ -303,8 +293,9 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S
 ; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT1:%.*]], align 8
 ; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT2:%.*]], align 8
 ; CHECK-NEXT:    store double [[TMP10]], ptr [[EXT3:%.*]], align 8
-; CHECK-NEXT:    store double [[B1]], ptr [[EXT4:%.*]], align 8
-; CHECK-NEXT:    store double [[B1]], ptr [[EXT5:%.*]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    store double [[TMP11]], ptr [[EXT4:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP11]], ptr [[EXT5:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -386,21 +377,33 @@ define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
 
 ; This checks that we choose to group consecutive extracts from the same vectors.
 define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
-; CHECK-LABEL: @ChecksExtractScores(
-; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
-; CHECK-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
-; CHECK-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
-; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
-; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @ChecksExtractScores(
+; SSE-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
+; SSE-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP4]]
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
+; SSE-NEXT:    store <2 x double> [[TMP6]], ptr [[STOREARRAY:%.*]], align 8
+; SSE-NEXT:    ret void
+;
+; AVX-LABEL: @ChecksExtractScores(
+; AVX-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
+; AVX-NEXT:    [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
+; AVX-NEXT:    [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
+; AVX-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
+; AVX-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
+; AVX-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
+; AVX-NEXT:    store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
+; AVX-NEXT:    ret void
 ;
   %idx1 = getelementptr inbounds double, ptr %array, i64 1
   %loadA0 = load double, ptr %array, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
index 50b19d01ad58f1..6922df8991b831 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
@@ -6,10 +6,10 @@ define void @test(ptr %block, ptr noalias %pixels, i1 %b) {
 ; CHECK-SAME: ptr [[BLOCK:%.*]], ptr noalias [[PIXELS:%.*]], i1 [[B:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 true, i1 poison, i1 false, i1 false>, i1 [[B]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[BLOCK]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i16> [[TMP2]] to <4 x i8>
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> [[TMP4]], <4 x i8> [[TMP1]]
 ; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr [[PIXELS]], align 1
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index 593aad82ad5d87..e84c6eda9e60a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -186,10 +186,11 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void
@@ -200,10 +201,11 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
-; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
@@ -252,10 +254,11 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
-; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
@@ -305,10 +308,11 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2:       lp:
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
 ; SSE2:       ext:
 ; SSE2-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 495a503311ab9e..985d68dfacab9d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -145,33 +145,38 @@ define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> <i32 1, i32 poison, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-; CHECK-NEXT:    [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP9]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT:    [[TMP16]] = fadd <4 x float> [[TMP4]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP19]] = insertelement <2 x float> [[TMP18]], float [[TMP9]], i32 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 0
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 1
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP16]], i32 2
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP16]], i32 3
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 5b33c6e889363e..726afbac2d7076 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -7,67 +7,55 @@
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; SSE-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX512F-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
@@ -533,181 +521,169 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; SSE-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; SSE-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16
-; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
-; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1
-; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3
-; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
-; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1
-; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2
-; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3
-; SSE-NEXT:    [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
-; SSE-NEXT:    store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
+; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; SSE-NEXT:    [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; SSE-NEXT:    [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3
-; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0
-; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1
-; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2
-; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3
-; SSE-NEXT:    [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]]
-; SSE-NEXT:    store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
+; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; AVX-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; AVX-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
-; AVX-NEXT:    [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
-; AVX-NEXT:    [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
-; AVX-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
-; AVX-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
-; AVX-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; AVX-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; AVX-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
-; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
-; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
-; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
-; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
-; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
-; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
-; AVX-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
-; AVX-NEXT:    store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
+; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; AVX2-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; AVX2-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
-; AVX2-NEXT:    [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
-; AVX2-NEXT:    [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
-; AVX2-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
-; AVX2-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
-; AVX2-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; AVX2-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; AVX2-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
-; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
-; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
-; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
-; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
-; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
-; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
-; AVX2-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
-; AVX2-NEXT:    store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
+; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512F-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <16 x ptr> poison, ptr [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <16 x ptr> [[TMP3]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <16 x ptr> [[TMP4]], <16 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20, i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
 ; AVX512F-NEXT:    store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512VL-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <16 x ptr> poison, ptr [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <16 x ptr> [[TMP3]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <16 x ptr> [[TMP4]], <16 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20, i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
 ; AVX512VL-NEXT:    store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 09d6c77557efaa..418ba65c0aeb6a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -7,67 +7,55 @@
 
 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; SSE-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; SSE-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX2-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX2-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 44
+; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
-; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; AVX512F-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP4]], i64 1
+; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i64 2
+; AVX512F-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
@@ -533,181 +521,169 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; SSE-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; SSE-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16
-; SSE-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
-; SSE-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1
-; SSE-NEXT:    [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3
-; SSE-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
-; SSE-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1
-; SSE-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2
-; SSE-NEXT:    [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3
-; SSE-NEXT:    [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
-; SSE-NEXT:    store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
+; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; SSE-NEXT:    [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; SSE-NEXT:    [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3
-; SSE-NEXT:    [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0
-; SSE-NEXT:    [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1
-; SSE-NEXT:    [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2
-; SSE-NEXT:    [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3
-; SSE-NEXT:    [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]]
-; SSE-NEXT:    store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
+; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; AVX-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; AVX-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
-; AVX-NEXT:    [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
-; AVX-NEXT:    [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
-; AVX-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
-; AVX-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
-; AVX-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; AVX-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; AVX-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
-; AVX-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
-; AVX-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
-; AVX-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
-; AVX-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
-; AVX-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
-; AVX-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
-; AVX-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
-; AVX-NEXT:    store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
+; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
-; AVX2-NEXT:    [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
-; AVX2-NEXT:    [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
-; AVX2-NEXT:    [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
-; AVX2-NEXT:    [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
-; AVX2-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
-; AVX2-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
-; AVX2-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
-; AVX2-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
-; AVX2-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
-; AVX2-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4
-; AVX2-NEXT:    [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5
-; AVX2-NEXT:    [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3
-; AVX2-NEXT:    [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4
-; AVX2-NEXT:    [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5
-; AVX2-NEXT:    [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6
-; AVX2-NEXT:    [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
-; AVX2-NEXT:    [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
-; AVX2-NEXT:    store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
+; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
-; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512F-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = insertelement <16 x ptr> poison, ptr [[TMP1:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <16 x ptr> [[TMP3]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr float, <16 x ptr> [[TMP4]], <16 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20, i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512F-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
 ; AVX512F-NEXT:    store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_div(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20>
-; AVX512VL-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = insertelement <16 x ptr> poison, ptr [[TMP1:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <16 x ptr> [[TMP3]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr float, <16 x ptr> [[TMP4]], <16 x i64> <i64 0, i64 10, i64 3, i64 14, i64 17, i64 8, i64 5, i64 20, i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512VL-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
 ; AVX512VL-NEXT:    store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
index 249b51592760c0..92a4095c7c57a8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
@@ -71,120 +71,66 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2
 ;
 ; AVX-LABEL: @compute_min(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; AVX-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
-; AVX-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1
-; AVX-NEXT:    [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2
-; AVX-NEXT:    [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2
-; AVX-NEXT:    [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2
-; AVX-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2
-; AVX-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2
-; AVX-NEXT:    [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3
-; AVX-NEXT:    [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3
-; AVX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2
-; AVX-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2
-; AVX-NEXT:    [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4
-; AVX-NEXT:    [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4
-; AVX-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2
-; AVX-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2
-; AVX-NEXT:    [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5
-; AVX-NEXT:    [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5
-; AVX-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2
-; AVX-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2
-; AVX-NEXT:    [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6
-; AVX-NEXT:    [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6
-; AVX-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2
-; AVX-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2
-; AVX-NEXT:    [[TMP20:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7
-; AVX-NEXT:    [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7
-; AVX-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2
-; AVX-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2
-; AVX-NEXT:    [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]])
-; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48
-; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32
-; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
-; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]]
-; AVX-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
-; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48
-; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32
-; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
-; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]]
-; AVX-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
+; AVX-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]])
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
+; AVX-NEXT:    [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]])
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]])
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
+; AVX-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
+; AVX-NEXT:    [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]])
+; AVX-NEXT:    [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64>
+; AVX-NEXT:    [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], <i64 32, i64 48>
+; AVX-NEXT:    [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64>
+; AVX-NEXT:    [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], <i64 48, i64 32>
+; AVX-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]]
+; AVX-NEXT:    [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64>
+; AVX-NEXT:    [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], <i64 16, i64 16>
+; AVX-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]]
+; AVX-NEXT:    [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
+; AVX-NEXT:    [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]]
+; AVX-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
+; AVX-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
+; AVX-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
+; AVX-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1
 ; AVX-NEXT:    ret { i64, i64 } [[DOTFCA_1_INSERT]]
 ;
 ; AVX2-LABEL: @compute_min(
 ; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
-; AVX2-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]])
-; AVX2-NEXT:    [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1
-; AVX2-NEXT:    [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1
-; AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2
-; AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2
-; AVX2-NEXT:    [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]])
-; AVX2-NEXT:    [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2
-; AVX2-NEXT:    [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2
-; AVX2-NEXT:    [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4
-; AVX2-NEXT:    [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4
-; AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2
-; AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2
-; AVX2-NEXT:    [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]])
-; AVX2-NEXT:    [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5
-; AVX2-NEXT:    [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5
-; AVX2-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2
-; AVX2-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2
-; AVX2-NEXT:    [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]])
-; AVX2-NEXT:    [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6
-; AVX2-NEXT:    [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6
-; AVX2-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2
-; AVX2-NEXT:    [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2
-; AVX2-NEXT:    [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]])
-; AVX2-NEXT:    [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64>
-; AVX2-NEXT:    [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]], <i64 32, i64 48>
-; AVX2-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0
-; AVX2-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1
-; AVX2-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]]
-; AVX2-NEXT:    [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
-; AVX2-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16
-; AVX2-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
-; AVX2-NEXT:    [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64
-; AVX2-NEXT:    [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]]
-; AVX2-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
-; AVX2-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2
-; AVX2-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2
-; AVX2-NEXT:    [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]])
-; AVX2-NEXT:    [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64>
-; AVX2-NEXT:    [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]], <i64 32, i64 48>
+; AVX2-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 0, i32 4>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 1, i32 5>
+; AVX2-NEXT:    [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]])
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 3, i32 6>
+; AVX2-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]])
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 2, i32 7>
+; AVX2-NEXT:    [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]])
+; AVX2-NEXT:    [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64>
+; AVX2-NEXT:    [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], <i64 32, i64 48>
+; AVX2-NEXT:    [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64>
+; AVX2-NEXT:    [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], <i64 48, i64 32>
+; AVX2-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]]
+; AVX2-NEXT:    [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64>
+; AVX2-NEXT:    [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], <i64 16, i64 16>
+; AVX2-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]]
+; AVX2-NEXT:    [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
+; AVX2-NEXT:    [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]]
 ; AVX2-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
+; AVX2-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
 ; AVX2-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
-; AVX2-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]]
-; AVX2-NEXT:    [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
-; AVX2-NEXT:    [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16
-; AVX2-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
-; AVX2-NEXT:    [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64
-; AVX2-NEXT:    [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]]
-; AVX2-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
+; AVX2-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1
 ; AVX2-NEXT:    ret { i64, i64 } [[DOTFCA_1_INSERT]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea52357e04f3..2f40f42629b17b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,24 @@ define i16 @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
+; CHECK-NEXT:    [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8
 ; CHECK-NEXT:    br label [[WHILE:%.*]]
 ; CHECK:       while:
 ; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT:    [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[A3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP7]], <8 x i32> <i32 2, i32 3, i32 poison, i32 1, i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i64> [[TMP8]], i64 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i64> [[TMP9]], i64 [[TMP4]], i32 7
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP11]])
+; CHECK-NEXT:    [[OP_RDX5]] = xor i64 [[TMP1]], [[TMP12]]
 ; CHECK-NEXT:    br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 8b2b15283601a2..1461ef7eda09d9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -25,10 +25,10 @@ define i32 @test(ptr nocapture readonly %p) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <8 x i32> [[TMP0]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP2]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
@@ -97,11 +97,11 @@ define i32 @test2(ptr nocapture readonly %p, ptr nocapture readonly %q) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP3]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
@@ -186,12 +186,12 @@ define i32 @test3(ptr nocapture readonly %p, ptr nocapture readonly %q) {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[SUM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
index 40dcc79f79ffce..b52f861afa29b0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
@@ -10,7 +10,7 @@
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-5'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '7'
+; YAML-NEXT:    - TreeSize:        '9'
 
 define void @test(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll
index 7de2cde45525ae..4ac24d44e6c951 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll
@@ -8,18 +8,9 @@
 ; YAML-NEXT:  Function:        test
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-4'
+; YAML-NEXT:    - Cost:            '-3'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '4'
-; YAML-LABEL: --- !Passed
-; YAML-NEXT:  Pass:            slp-vectorizer
-; YAML-NEXT:  Name:            VectorizedList
-; YAML-NEXT:  Function:        test
-; YAML-NEXT:  Args:
-; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-2'
-; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '2'
+; YAML-NEXT:    - TreeSize:        '5'
 
 define <4 x float> @test(ptr %x, float %v, float %a) {
 ; CHECK-LABEL: define <4 x float> @test(
@@ -27,12 +18,11 @@ define <4 x float> @test(ptr %x, float %v, float %a) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]]
-; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[V]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x float> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    ret <4 x float> [[TMP7]]
 ;
   %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1
   %x0 = load float, ptr %x, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 3bc6e64606e399..13d6d6a2e56220 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
   ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
   ; YAML-NEXT:   - Cost:            '-1'
   ; YAML-NEXT:   - String:          ' and with tree size '
-  ; YAML-NEXT:   - TreeSize:        '7'
+  ; YAML-NEXT:   - TreeSize:        '8'
 entry:
   %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1
   %idx0 = load i32, ptr %off0.1, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
index f1cd42a2c404af..a5f3cd0e4f168c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -12,12 +12,11 @@ define i32 @foo(ptr %diff) #0 {
 ; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DIFF:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DIFF]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[ARRAYIDX6]], align 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[A_088]]
@@ -88,7 +87,7 @@ for.body:                                         ; preds = %for.body, %entry
   ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
   ; YAML-NEXT:   - Cost:            '-5'
   ; YAML-NEXT:   - String:          ' and with tree size '
-  ; YAML-NEXT:   - TreeSize:        '4'
+  ; YAML-NEXT:   - TreeSize:        '5'
 
   ; YAML:      --- !Passed
   ; YAML-NEXT: Pass:            slp-vectorizer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
index 6d96d6d29cd59e..b60bd6d4b1e6b8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -64,7 +64,7 @@ for.body:                                         ; preds = %for.body, %entry
   ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
   ; YAML-NEXT:   - Cost:            '-1'
   ; YAML-NEXT:   - String:          ' and with tree size '
-  ; YAML-NEXT:   - TreeSize:        '4'
+  ; YAML-NEXT:   - TreeSize:        '6'
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
index 1a6ff2385905b3..2c789165acf96e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll
@@ -10,19 +10,20 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227)
 ; CHECK-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
 ; CHECK-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x ptr> <ptr null, ptr null, ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[I250]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 1, i32 3, i32 0, i32 2, i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <4 x i32> <i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I248]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I245]], i32 3
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP9]], <8 x ptr> <ptr null, ptr null, ptr poison, ptr poison, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 8, i32 9, i32 1, i32 0, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
 ; CHECK-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
index 787bd39759dc78..3d4342b18efb03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll
@@ -9,11 +9,11 @@ define ptr @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP5:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP6:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr null, align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[SHUFFLE]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP5]], <2 x double> [[TMP2]])
 ; CHECK-NEXT:    br label [[BODY]]
 ;
 entry:
@@ -54,8 +54,8 @@ define void @test1(ptr %agg.result, ptr %this) {
 ; CHECK:       return:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ [[TMP1]], [[IF_END]] ], [ <float 1.000000e+00, float 0.000000e+00>, [[LOR_LHS_FALSE]] ], [ <float 1.000000e+00, float 0.000000e+00>, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[C_I_I_I:%.*]] = getelementptr inbounds float, ptr [[AGG_RESULT:%.*]], i32 2
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    store <2 x float> [[SHUFFLE]], ptr [[C_I_I_I]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[C_I_I_I]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -103,10 +103,10 @@ define void @test2(ptr %p1, ptr %p2) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> <double 4.000000e+00, double 4.100000e+00>, [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], <double 2.000000e+00, double 2.100000e+00>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], <double 3.000000e+00, double 3.100000e+00>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x double> [ [[TMP11]], [[BB1]] ], [ [[TMP16:%.*]], [[BB6:%.*]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP15:%.*]], [[BB6:%.*]] ]
 ; CHECK-NEXT:    [[X0:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x double>, ptr [[X0]], align 8
 ; CHECK-NEXT:    br i1 poison, label [[BB3:%.*]], label [[BB6]]
@@ -117,8 +117,7 @@ define void @test2(ptr %p1, ptr %p2) {
 ; CHECK:       bb5:
 ; CHECK-NEXT:    br label [[BB6]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
-; CHECK-NEXT:    [[TMP16]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP15]] = phi <2 x double> [ [[TMP13]], [[BB2]] ], [ [[TMP14]], [[BB4]] ], [ [[TMP14]], [[BB5]] ]
 ; CHECK-NEXT:    br label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index cfbbe14186b501..c949460d02ec2e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -6,12 +6,12 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 0, i64 7, i64 33, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -58,14 +58,14 @@ define void @test1() {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 0, i64 7, i64 33, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -112,11 +112,11 @@ define void @test_div() {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 0, i64 7, i64 33, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], <i32 1, i32 2, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv <4 x i32> [[TMP2]], <i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -164,11 +164,11 @@ define void @test_rem() {
 ; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 0, i64 7, i64 33, i64 1>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], <i32 1, i32 2, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = urem <4 x i32> [[TMP2]], <i32 1, i32 1, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index 30f328293cdaa3..722463cc3e4c30 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -12,9 +12,9 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
-; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <8 x float> [[TMP10]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
 ; CHECK-NEXT:    store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
index f0e734d8c5aefe..ccb3121beac38c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -9,30 +9,26 @@ define  void @foo (ptr %A, ptr %B, ptr %Result) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 256, 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP16:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP15:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], ptr [[A:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP8]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP16:%.*]] = fsub <2 x float> [[TMP11]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[TMP11]], [[TMP15]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP20]] = fadd <2 x float> [[TMP2]], [[TMP21]]
-; CHECK-NEXT:    [[TMP18]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP12:%.*]] = fsub <2 x float> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP15]] = fadd <2 x float> [[TMP2]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[TMP16]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    store <2 x float> [[TMP20]], ptr [[RESULT:%.*]], align 4
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[RESULT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
index 31b328ca2f5139..4ffcaf1359428a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/return.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
@@ -42,13 +42,13 @@ entry:
 define double @return2(ptr nocapture readonly %x) {
 ; CHECK-LABEL: @return2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[X]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[X:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    ret double [[ADD5]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index 82085ade519e23..08fce7c452ee6e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
@@ -23,11 +23,12 @@ define void @test() {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16
+; CHECK-NEXT:    [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
index dadf5992ba288d..8ae85ab6c2e941 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -5,23 +5,19 @@ define void @test(i1 %c, ptr %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8
-; CHECK-NEXT:    [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
+; CHECK-NEXT:    [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP1]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    br label [[JOIN:%.*]]
 ; CHECK:       else:
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
 ; CHECK-NEXT:    [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x i64> [ [[TMP3]], [[IF]] ], [ [[TMP6]], [[ELSE]] ]
 ; CHECK-NEXT:    ret void
 ;
   br i1 %c, label %if, label %else
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
index f682851b0406ee..631481d2651efb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -11,12 +11,11 @@ define i32 @foo(ptr nocapture readonly %diff) #0 {
 ; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DIFF:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i64 [[TMP0]], 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DIFF]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[ARRAYIDX6]], align 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP5]], [[A_088]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
index a009b1eaf65f13..fe0dc814cc8060 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
@@ -6,14 +6,13 @@ define void @wombat(ptr %ptr, ptr %ptr1) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PTR1:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[PTR:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt <4 x i32> [[TMP1]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> undef, <4 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP6]]
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[TMP27]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt <4 x i32> [[TMP3]], undef
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> undef, <4 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP27]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
index 9810d50beea736..68fb6c6202ce7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll
@@ -11,84 +11,51 @@ declare double @llvm.sin.f64(double)
 
 define void @test() {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr @src, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 2, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]])
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 1, i32 5>
 ; CHECK-NEXT:    [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr @dst, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store <2 x double> [[TMP12]], ptr @dst, align 8
 ; CHECK-NEXT:    ret void
 ;
 ; VECLIB-LABEL: @test(
-; VECLIB-NEXT:    [[A0:%.*]] = load double, ptr @src, align 8
-; VECLIB-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
-; VECLIB-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
-; VECLIB-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
-; VECLIB-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
-; VECLIB-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
-; VECLIB-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
-; VECLIB-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
-; VECLIB-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; VECLIB-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
+; VECLIB-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; VECLIB-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 2, i32 6>
 ; VECLIB-NEXT:    [[TMP3:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP2]])
-; VECLIB-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
-; VECLIB-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
-; VECLIB-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP5]])
-; VECLIB-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; VECLIB-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
+; VECLIB-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 3, i32 7>
+; VECLIB-NEXT:    [[TMP5:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP4]])
+; VECLIB-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 0, i32 4>
+; VECLIB-NEXT:    [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
+; VECLIB-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 1, i32 5>
 ; VECLIB-NEXT:    [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
-; VECLIB-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
-; VECLIB-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
-; VECLIB-NEXT:    [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
-; VECLIB-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
-; VECLIB-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
-; VECLIB-NEXT:    [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
-; VECLIB-NEXT:    store <2 x double> [[TMP15]], ptr @dst, align 8
+; VECLIB-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]]
+; VECLIB-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]]
+; VECLIB-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]]
+; VECLIB-NEXT:    store <2 x double> [[TMP12]], ptr @dst, align 8
 ; VECLIB-NEXT:    ret void
 ;
 ; AMDLIBM-LABEL: @test(
-; AMDLIBM-NEXT:    [[A0:%.*]] = load double, ptr @src, align 8
-; AMDLIBM-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
-; AMDLIBM-NEXT:    [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
-; AMDLIBM-NEXT:    [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
-; AMDLIBM-NEXT:    [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
-; AMDLIBM-NEXT:    [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
-; AMDLIBM-NEXT:    [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
-; AMDLIBM-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
-; AMDLIBM-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
-; AMDLIBM-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1
+; AMDLIBM-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; AMDLIBM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 2, i32 6>
 ; AMDLIBM-NEXT:    [[TMP3:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP2]])
-; AMDLIBM-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0
-; AMDLIBM-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1
-; AMDLIBM-NEXT:    [[TMP6:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP5]])
-; AMDLIBM-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; AMDLIBM-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1
+; AMDLIBM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 3, i32 7>
+; AMDLIBM-NEXT:    [[TMP5:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP4]])
+; AMDLIBM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 0, i32 4>
+; AMDLIBM-NEXT:    [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]])
+; AMDLIBM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> <i32 1, i32 5>
 ; AMDLIBM-NEXT:    [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
-; AMDLIBM-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0
-; AMDLIBM-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1
-; AMDLIBM-NEXT:    [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]])
-; AMDLIBM-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
-; AMDLIBM-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]]
-; AMDLIBM-NEXT:    [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]]
-; AMDLIBM-NEXT:    store <2 x double> [[TMP15]], ptr @dst, align 8
+; AMDLIBM-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]]
+; AMDLIBM-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]]
+; AMDLIBM-NEXT:    [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]]
+; AMDLIBM-NEXT:    store <2 x double> [[TMP12]], ptr @dst, align 8
 ; AMDLIBM-NEXT:    ret void
 ;
   %a0 = load double, ptr @src, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
index 6825f43b5a9eb4..3ef013b4b18281 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll
@@ -7,29 +7,14 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr {
 ; CHECK-LABEL: @_Z4testP1S(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15
-; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4
-; CHECK-NEXT:    [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12
-; CHECK-NEXT:    [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5
-; CHECK-NEXT:    [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 1, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]]
-; CHECK-NEXT:    store <8 x i32> [[TMP11]], ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <8 x i32> <i32 3, i32 7, i32 6, i32 4, i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr [[P]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -106,11 +91,9 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add
 ; CHECK-NEXT:    [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
-; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
+; CHECK-NEXT:    store <8 x i32> [[TMP3]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -165,14 +148,12 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    store <8 x i32> [[TMP8]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
index ba83ff096c9acb..c80e5e1e79e81b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll
@@ -4,20 +4,16 @@
 define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    store <8 x i32> [[TMP10]], ptr [[S:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[ADDR:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x ptr> poison, ptr [[P:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x ptr> [[TMP1]], <16 x ptr> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, <16 x ptr> [[TMP2]], <16 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP4]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    store <8 x i32> [[TMP8]], ptr [[S:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
index 2f6868d8dfd628..4704bb77b5af33 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
@@ -10,7 +10,7 @@
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-7'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '8'
 define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; CHECK-LABEL: @stores(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
@@ -66,7 +66,7 @@ define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
 ; YAML-NEXT:    - Cost:            '-9'
 ; YAML-NEXT:    - String:          ' and with tree size '
-; YAML-NEXT:    - TreeSize:        '6'
+; YAML-NEXT:    - TreeSize:        '8'
 define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) {
 ; CHECK-LABEL: @insertelems(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
index 87063fc3f7a820..3db96d2d5c2a07 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -7,19 +7,13 @@
 define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
 ; ENABLED-LABEL: @test_supernode_add(
 ; ENABLED-NEXT:  entry:
-; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
-; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT:    [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
-; ENABLED-NEXT:    [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
-; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
-; ENABLED-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
-; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1
-; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
 ; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT:    ret void
 ;
@@ -54,19 +48,13 @@ entry:
 define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
 ; ENABLED-LABEL: @test_supernode_addsub(
 ; ENABLED-NEXT:  entry:
-; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
-; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT:    [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
-; ENABLED-NEXT:    [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
-; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
-; ENABLED-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
-; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1
-; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
 ; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT:    ret void
 ;
@@ -102,22 +90,16 @@ entry:
 define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) {
 ; ENABLED-LABEL: @test_supernode_addsub_alt(
 ; ENABLED-NEXT:  entry:
-; ENABLED-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
-; ENABLED-NEXT:    [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
-; ENABLED-NEXT:    [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT:    [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
-; ENABLED-NEXT:    [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
-; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
-; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
-; ENABLED-NEXT:    [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
-; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]]
-; ENABLED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; ENABLED-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; ENABLED-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
-; ENABLED-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
-; ENABLED-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
+; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]]
+; ENABLED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]]
+; ENABLED-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]]
 ; ENABLED-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
 ; ENABLED-NEXT:    store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT:    ret void
@@ -177,18 +159,15 @@ entry:
 define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) {
 ; ENABLED-LABEL: @supernode_scheduling(
 ; ENABLED-NEXT:  entry:
-; ENABLED-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
 ; ENABLED-NEXT:    [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8
-; ENABLED-NEXT:    [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
-; ENABLED-NEXT:    [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
 ; ENABLED-NEXT:    [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8
 ; ENABLED-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8
-; ENABLED-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
-; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B1]], i32 1
-; ENABLED-NEXT:    [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]]
-; ENABLED-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
-; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1
-; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; ENABLED-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
+; ENABLED-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT:    [[TMP4:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP3]]
+; ENABLED-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1
+; ENABLED-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]]
 ; ENABLED-NEXT:    store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8
 ; ENABLED-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
index 243087c6d8d95b..fd3c1a57aff344 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll
@@ -5,14 +5,12 @@
 define void @vec3_vectorize_call(ptr %Colour, float %0) {
 ; NON-POW2-LABEL: @vec3_vectorize_call(
 ; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4
-; NON-POW2-NEXT:    [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
-; NON-POW2-NEXT:    [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0
-; NON-POW2-NEXT:    [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1
-; NON-POW2-NEXT:    [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
-; NON-POW2-NEXT:    store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2
+; NON-POW2-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; NON-POW2-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> <i32 3, i32 4, i32 2>
+; NON-POW2-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer)
+; NON-POW2-NEXT:    store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4
 ; NON-POW2-NEXT:    ret void
 ;
 ; POW2-ONLY-LABEL: @vec3_vectorize_call(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
index 7bcb2ece779210..7e8f4fb432fcd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -13,11 +13,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, ptr [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
-; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
 ; CHECK-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
-; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
 ; CHECK-NEXT:    [[T17:%.*]] = load i32, ptr [[T16]], align 4
 ; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -27,10 +24,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
-; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
-; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -39,18 +32,26 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
-; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
+; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> [[TMP8]], <4 x i32> <i32 4, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
+; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T34]], i32 6
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
index 82f8aa5f9be1be..644019a58016f4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -13,11 +13,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, ptr [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
-; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
 ; CHECK-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
-; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
 ; CHECK-NEXT:    [[T17:%.*]] = load i32, ptr [[T16]], align 4
 ; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -27,10 +24,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
-; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
-; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
-; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -39,18 +32,26 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
-; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
+; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> [[TMP8]], <4 x i32> <i32 4, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
+; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[T34]], i32 6
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 69ecf1852aedd7..5739ab1affd781 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,11 +7,8 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
 ; CHECK-NEXT:    [[T5:%.*]] = load i32, ptr [[T4]], align 4
 ; CHECK-NEXT:    [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
-; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
 ; CHECK-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
-; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
 ; CHECK-NEXT:    [[T17:%.*]] = load i32, ptr [[T16]], align 4
 ; CHECK-NEXT:    [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -21,11 +18,7 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
-; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
-; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
 ; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
-; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -35,16 +28,22 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
 ; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
+; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[T32]], i32 2
 ; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
 ; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
index 6ac6884ca5377f..2ded502bd98f80 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll
@@ -8,20 +8,21 @@ define void @test(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
 ; CHECK-NEXT:    [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 4, i32 5, i32 poison, i32 3>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr null, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    store <4 x float> [[TMP12]], ptr [[RESULT]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul <4 x float> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    store <4 x float> [[TMP13]], ptr [[RESULT]], align 4
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
index 4c3c712419a2e1..865fd5d42ad759 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll
@@ -4,13 +4,13 @@
 define void @test(ptr %isec) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[ISEC:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ISEC]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr [[ISEC]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[ISEC:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ISEC]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:



More information about the llvm-commits mailing list