[llvm] d00579b - Revert "[SLP]Reduce number of alternate instruction, where possible"

Sun Feb 2 06:01:51 PST 2025

Author: Martin Storsjö
Date: 2025-02-02T15:56:08+02:00
New Revision: d00579be39e8a470d7a0ff79ff6deadf9e003781

URL: https://github.com/llvm/llvm-project/commit/d00579be39e8a470d7a0ff79ff6deadf9e003781
DIFF: https://github.com/llvm/llvm-project/commit/d00579be39e8a470d7a0ff79ff6deadf9e003781.diff

LOG: Revert "[SLP]Reduce number of alternate instruction, where possible"

This reverts commit d5a7a483a65f830a0c7a931781bc90046dc67ff4.

That commit triggers failed asserts, see
https://github.com/llvm/llvm-project/pull/123360 for details.

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
    llvm/lib/Target/X86/X86TargetTransformInfo.h
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
    llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
    llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
    llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
    llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
    llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
    llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
    llvm/test/Transforms/SLPVectorizer/addsub.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 6effc919ecb751..ee93aba0c015a8 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1759,10 +1759,6 @@ class TargetTransformInfo {
   /// scalable version of the vectorized loop.
   bool preferFixedOverScalableIfEqualCost() const;
 
-  /// \returns True if target prefers SLP vectorizer with altermate opcode
-  /// vectorization, false - otherwise.
-  bool preferAlternateOpcodeVectorization() const;
-
   /// \returns True if the target prefers reductions in loop.
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
@@ -2310,7 +2306,6 @@ class TargetTransformInfo::Concept {
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
   virtual bool preferFixedOverScalableIfEqualCost() const = 0;
-  virtual bool preferAlternateOpcodeVectorization() const = 0;
   virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@@ -3111,9 +3106,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool preferFixedOverScalableIfEqualCost() const override {
     return Impl.preferFixedOverScalableIfEqualCost();
   }
-  bool preferAlternateOpcodeVectorization() const override {
-    return Impl.preferAlternateOpcodeVectorization();
-  }
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.preferInLoopReduction(Opcode, Ty, Flags);

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 0e4cb00b63049b..b51663adcd8d06 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -989,8 +989,6 @@ class TargetTransformInfoImplBase {
 
   bool preferFixedOverScalableIfEqualCost() const { return false; }
 
-  bool preferAlternateOpcodeVectorization() const { return true; }
-
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3c5faab346b4e3..424bb7be233836 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1360,10 +1360,6 @@ bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const {
   return TTIImpl->preferFixedOverScalableIfEqualCost();
 }
 
-bool TargetTransformInfo::preferAlternateOpcodeVectorization() const {
-  return TTIImpl->preferAlternateOpcodeVectorization();
-}
-
 bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                                 ReductionFlags Flags) const {
   return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 7db79b8f474722..9b364391f0fa47 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -111,8 +111,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
 
-  bool preferAlternateOpcodeVectorization() const { return false; }
-
   bool preferEpilogueVectorization() const {
     // Epilogue vectorization is usually unprofitable - tail folding or
     // a smaller VF would have been better.  This a blunt hammer - we

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index d344fdb1495178..7786616f89aa6e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -292,7 +292,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
 
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
-  bool preferAlternateOpcodeVectorization() const { return false; }
   bool prefersVectorizedAddressing() const;
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();

diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0f095078120da..da073f38b53fb2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -840,35 +840,6 @@ class InstructionsState {
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
-  /// Checks if main/alt instructions are shift operations.
-  bool isShiftOp() const {
-    return getMainOp()->isShift() && getAltOp()->isShift();
-  }
-
-  /// Checks if main/alt instructions are bitwise logic operations.
-  bool isBitwiseLogicOp() const {
-    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
-  }
-
-  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
-  bool isMulDivLikeOp() const {
-    constexpr std::array<unsigned, 8> MulDiv = {
-        Instruction::Mul,  Instruction::FMul, Instruction::SDiv,
-        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
-        Instruction::URem, Instruction::FRem};
-    return is_contained(MulDiv, getOpcode()) &&
-           is_contained(MulDiv, getAltOpcode());
-  }
-
-  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
-  bool isAddSubLikeOp() const {
-    constexpr std::array<unsigned, 4> AddSub = {
-        Instruction::Add, Instruction::Sub, Instruction::FAdd,
-        Instruction::FSub};
-    return is_contained(AddSub, getOpcode()) &&
-           is_contained(AddSub, getAltOpcode());
-  }
-
   /// Checks if the current state is valid, i.e. has non-null MainOp
   bool valid() const { return MainOp && AltOp; }
 
@@ -1506,7 +1477,6 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
-    ScalarsInSplitNodes.clear();
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
@@ -1542,7 +1512,7 @@ class BoUpSLP {
   /// should be represented as an empty order, so this is used to
   /// decide if we can canonicalize a computed order.  Undef elements
   /// (represented as size) are ignored.
-  static bool isIdentityOrder(ArrayRef<unsigned> Order) {
+  bool isIdentityOrder(ArrayRef<unsigned> Order) const {
     assert(!Order.empty() && "expected non-empty order");
     const unsigned Sz = Order.size();
     return all_of(enumerate(Order), [&](const auto &P) {
@@ -3242,35 +3212,12 @@ class BoUpSLP {
 
     /// \returns Common mask for reorder indices and reused scalars.
     SmallVector<int> getCommonMask() const {
-      if (State == TreeEntry::SplitVectorize)
-        return {};
       SmallVector<int> Mask;
       inversePermutation(ReorderIndices, Mask);
       ::addMask(Mask, ReuseShuffleIndices);
       return Mask;
     }
 
-    /// \returns The mask for split nodes.
-    SmallVector<int> getSplitMask() const {
-      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
-             "Expected only split vectorize node.");
-      SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
-      unsigned CommonVF = std::max<unsigned>(
-          CombinedEntriesWithIndices.back().second,
-          Scalars.size() - CombinedEntriesWithIndices.back().second);
-      for (auto [Idx, I] : enumerate(ReorderIndices))
-        Mask[I] =
-            Idx + (Idx >= CombinedEntriesWithIndices.back().second
-                       ? CommonVF - CombinedEntriesWithIndices.back().second
-                       : 0);
-      return Mask;
-    }
-
-    /// Updates (reorders) SplitVectorize node according to the given mask \p
-    /// Mask and order \p MaskOrder.
-    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
-                          ArrayRef<int> MaskOrder);
-
     /// \returns true if the scalars in VL are equal to this entry.
     bool isSame(ArrayRef<Value *> VL) const {
       auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
@@ -3362,8 +3309,6 @@ class BoUpSLP {
                          ///< complex node like select/cmp to minmax, mul/add to
                          ///< fma, etc. Must be used for the following nodes in
                          ///< the pattern, not the very first one.
-      SplitVectorize,    ///< Splits the node into 2 subnodes, vectorizes them
-                         ///< independently and then combines back.
     };
     EntryState State;
 
@@ -3395,7 +3340,7 @@ class BoUpSLP {
     /// The index of this treeEntry in VectorizableTree.
     unsigned Idx = 0;
 
-    /// For gather/buildvector/alt opcode nodes, which are combined from
+    /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
     /// other nodes as a series of insertvector instructions.
     SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
 
@@ -3590,9 +3535,6 @@ class BoUpSLP {
       case CombinedVectorize:
         dbgs() << "CombinedVectorize\n";
         break;
-      case SplitVectorize:
-        dbgs() << "SplitVectorize\n";
-        break;
       }
       if (S) {
         dbgs() << "MainOp: " << *S.getMainOp() << "\n";
@@ -3671,10 +3613,8 @@ class BoUpSLP {
                           const EdgeInfo &UserTreeIdx,
                           ArrayRef<int> ReuseShuffleIndices = {},
                           ArrayRef<unsigned> ReorderIndices = {}) {
-    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
-                         EntryState == TreeEntry::SplitVectorize)) ||
-            (Bundle && EntryState != TreeEntry::NeedToGather &&
-             EntryState != TreeEntry::SplitVectorize)) &&
+    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
+            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
            "Need to vectorize gather entry?");
     // Gathered loads still gathered? Do not create entry, use the original one.
     if (GatheredLoadsEntriesFirst.has_value() &&
@@ -3708,28 +3648,11 @@ class BoUpSLP {
                   return VL[Idx];
                 });
       InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
-      if (S) {
+      if (S)
         Last->setOperations(S);
-      } else if (EntryState == TreeEntry::SplitVectorize) {
-        auto *MainOp =
-            cast<Instruction>(*find_if(Last->Scalars, IsaPred<Instruction>));
-        auto *AltOp = cast<Instruction>(*find_if(Last->Scalars, [=](Value *V) {
-          auto *I = dyn_cast<Instruction>(V);
-          return I && I->getOpcode() != MainOp->getOpcode();
-        }));
-        Last->setOperations(InstructionsState(MainOp, AltOp));
-      }
-      if (EntryState == TreeEntry::SplitVectorize) {
-        for (Value *V : VL) {
-          auto *I = dyn_cast<Instruction>(V);
-          if (!I)
-            continue;
-          ScalarsInSplitNodes.try_emplace(I, Last);
-        }
-      }
       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     }
-    if (!Last->isGather() && Last->State != TreeEntry::SplitVectorize) {
+    if (!Last->isGather()) {
       SmallPtrSet<Value *, 4> Processed;
       for (Value *V : VL) {
         if (isa<PoisonValue>(V))
@@ -3766,7 +3689,7 @@ class BoUpSLP {
         }
       }
       assert(!BundleMember && "Bundle and VL out of sync");
-    } else if (Last->isGather()) {
+    } else {
       // Build a map for gathered scalars to the nodes where they are used.
       bool AllConstsOrCasts = true;
       for (Value *V : VL)
@@ -3841,9 +3764,6 @@ class BoUpSLP {
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
-  /// Scalars, used in split vectorize nodes.
-  SmallDenseMap<Value *, TreeEntry *> ScalarsInSplitNodes;
-
   /// Maps a value to the proposed vectorizable size.
   SmallDenseMap<Value *, unsigned> InstrElementSize;
 
@@ -5766,14 +5686,12 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
              }) &&
       (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
     return std::nullopt;
-  if (TE.State == TreeEntry::SplitVectorize ||
-      ((TE.State == TreeEntry::Vectorize ||
-        TE.State == TreeEntry::StridedVectorize) &&
-       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
-        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
-    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
-           "Alternate instructions are only supported by "
-           "BinaryOperator and CastInst.");
+  if ((TE.State == TreeEntry::Vectorize ||
+       TE.State == TreeEntry::StridedVectorize) &&
+      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
+       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
+    assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
+                                 "BinaryOperator and CastInst.");
     return TE.ReorderIndices;
   }
   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
@@ -6025,30 +5943,6 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
   }
 }
 
-void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
-                                          ArrayRef<int> MaskOrder) {
-  assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
-  SmallVector<int> NewMask(getVectorFactor());
-  SmallVector<int> NewMaskOrder(getVectorFactor());
-  std::iota(NewMask.begin(), NewMask.end(), 0);
-  std::iota(NewMaskOrder.begin(), NewMaskOrder.end(), 0);
-  if (Idx == 0) {
-    copy(Mask, NewMask.begin());
-    copy(MaskOrder, NewMaskOrder.begin());
-  } else {
-    assert(Idx == 1 && "Expected either 0 or 1 index.");
-    unsigned Offset = CombinedEntriesWithIndices.back().second;
-    for (unsigned I : seq<unsigned>(Mask.size())) {
-      NewMask[I + Offset] = Mask[I] + Offset;
-      NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
-    }
-  }
-  reorderScalars(Scalars, NewMask);
-  reorderOrder(ReorderIndices, NewMaskOrder, /*BottomOrder=*/true);
-  if (BoUpSLP::isIdentityOrder(ReorderIndices))
-    ReorderIndices.clear();
-}
-
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
@@ -6083,8 +5977,7 @@ void BoUpSLP::reorderTopToBottom() {
     // Patterns like [fadd,fsub] can be combined into a single instruction in
     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
     // to take into account their order when looking for the most used order.
-    if (TE->hasState() && TE->isAltShuffle() &&
-        TE->State != TreeEntry::SplitVectorize) {
+    if (TE->hasState() && TE->isAltShuffle()) {
       VectorType *VecTy =
           getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
       unsigned Opcode0 = TE->getOpcode();
@@ -6122,8 +6015,7 @@ void BoUpSLP::reorderTopToBottom() {
       }
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::StridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -6154,8 +6046,7 @@ void BoUpSLP::reorderTopToBottom() {
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
       // just need to merge reordering shuffle and the reuse shuffle.
-      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE) &&
-          OpTE->State != TreeEntry::SplitVectorize)
+      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
         continue;
       // Count number of orders uses.
       const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
@@ -6262,8 +6153,6 @@ void BoUpSLP::reorderTopToBottom() {
       // Just do the reordering for the nodes with the given VF.
       if (TE->Scalars.size() != VF) {
         if (TE->ReuseShuffleIndices.size() == VF) {
-          assert(TE->State != TreeEntry::SplitVectorize &&
-                 "Split vectorized not expected.");
           // Need to reorder the reuses masks of the operands with smaller VF to
           // be able to find the match between the graph nodes and scalar
           // operands of the given node during vectorization/cost estimation.
@@ -6271,8 +6160,7 @@ void BoUpSLP::reorderTopToBottom() {
                         [VF, &TE](const EdgeInfo &EI) {
                           return EI.UserTE->Scalars.size() == VF ||
                                  EI.UserTE->Scalars.size() ==
-                                     TE->Scalars.size() ||
-                                 EI.UserTE->State == TreeEntry::SplitVectorize;
+                                     TE->Scalars.size();
                         }) &&
                  "All users must be of VF size.");
           if (SLPReVec) {
@@ -6295,29 +6183,19 @@ void BoUpSLP::reorderTopToBottom() {
           // Update ordering of the operands with the smaller VF than the given
           // one.
           reorderNodeWithReuses(*TE, Mask);
-          // Update orders in user split vectorize nodes.
-          for (EdgeInfo &EI : TE->UserTreeIndices) {
-            if (EI.UserTE->State != TreeEntry::SplitVectorize)
-              continue;
-            EI.UserTE->reorderSplitNode(EI.EdgeIdx, Mask, MaskOrder);
-          }
         }
         continue;
       }
-      if ((TE->State == TreeEntry::SplitVectorize &&
-           TE->ReuseShuffleIndices.empty()) ||
-          ((TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) &&
-           (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
-                InsertElementInst>(TE->getMainOp()) ||
-            (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
-        assert(
-            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
-                                     TE->ReuseShuffleIndices.empty())) &&
-            "Alternate instructions are only supported by BinaryOperator "
-            "and CastInst.");
-        // Build correct orders for extract{element,value}, loads,
-        // stores and alternate (split) nodes.
+      if ((TE->State == TreeEntry::Vectorize ||
+           TE->State == TreeEntry::StridedVectorize) &&
+          (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
+               InsertElementInst>(TE->getMainOp()) ||
+           (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
+        assert(!TE->isAltShuffle() &&
+               "Alternate instructions are only supported by BinaryOperator "
+               "and CastInst.");
+        // Build correct orders for extract{element,value}, loads and
+        // stores.
         reorderOrder(TE->ReorderIndices, Mask);
         if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
           TE->reorderOperands(Mask);
@@ -6338,12 +6216,6 @@ void BoUpSLP::reorderTopToBottom() {
         addMask(NewReuses, TE->ReuseShuffleIndices);
         TE->ReuseShuffleIndices.swap(NewReuses);
       }
-      // Update orders in user split vectorize nodes.
-      for (EdgeInfo &EI : TE->UserTreeIndices) {
-        if (EI.UserTE->State != TreeEntry::SplitVectorize)
-          continue;
-        EI.UserTE->reorderSplitNode(EI.EdgeIdx, Mask, MaskOrder);
-      }
     }
   }
 }
@@ -6356,8 +6228,7 @@ bool BoUpSLP::canReorderOperands(
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
                  (OpData.second->State == TreeEntry::Vectorize ||
-                  OpData.second->State == TreeEntry::StridedVectorize ||
-                  OpData.second->State == TreeEntry::SplitVectorize);
+                  OpData.second->State == TreeEntry::StridedVectorize);
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
@@ -6375,7 +6246,6 @@ bool BoUpSLP::canReorderOperands(
       // node, just reorder reuses mask.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
-          TE->State != TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
         GatherOps.push_back(TE);
       continue;
@@ -6385,7 +6255,6 @@ bool BoUpSLP::canReorderOperands(
                  [&Gather, UserTE, I](TreeEntry *TE) {
                    assert(TE->State != TreeEntry::Vectorize &&
                           TE->State != TreeEntry::StridedVectorize &&
-                          TE->State != TreeEntry::SplitVectorize &&
                           "Only non-vectorized nodes are expected.");
                    if (any_of(TE->UserTreeIndices,
                               [UserTE, I](const EdgeInfo &EI) {
@@ -6415,15 +6284,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SmallVector<TreeEntry *> NonVectorized;
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
-        TE->State != TreeEntry::StridedVectorize &&
-        TE->State != TreeEntry::SplitVectorize)
+        TE->State != TreeEntry::StridedVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::StridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.insert(TE.get());
     }
@@ -6442,7 +6309,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedEntries) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
           !all_of(drop_begin(TE->UserTreeIndices),
@@ -6468,35 +6334,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       return Data1.first->Idx > Data2.first->Idx;
     });
     for (auto &Data : UsersVec) {
-      if (Data.first->State == TreeEntry::SplitVectorize) {
-        assert(
-            Data.second.size() <= 2 &&
-            "Expected not greater than 2 operands for split vectorize node.");
-        if (any_of(Data.second, [](const auto &Op) {
-              return Op.second->UserTreeIndices.size() != 1;
-            }))
-          continue;
-        // Update orders in user split vectorize nodes.
-        assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
-               "Expected exactly 2 entries.");
-        for (const auto &P : Data.first->CombinedEntriesWithIndices) {
-          TreeEntry &OpTE = *VectorizableTree[P.first].get();
-          if (OpTE.isGather() || OpTE.ReorderIndices.empty())
-            continue;
-          SmallVector<int> Mask;
-          inversePermutation(OpTE.ReorderIndices, Mask);
-          SmallVector<int> MaskOrder(OpTE.ReorderIndices.size(),
-                                     PoisonMaskElem);
-          unsigned E = OpTE.ReorderIndices.size();
-          transform(OpTE.ReorderIndices, MaskOrder.begin(), [E](unsigned I) {
-            return I < E ? static_cast<int>(I) : PoisonMaskElem;
-          });
-          Data.first->reorderSplitNode(P.second ? 1 : 0, Mask, MaskOrder);
-          // Clear ordering of the operand.
-          OpTE.ReorderIndices.clear();
-        }
-        continue;
-      }
       // Check that operands are used only in the User node.
       SmallVector<TreeEntry *> GatherOps;
       if (!canReorderOperands(Data.first, Data.second, NonVectorized,
@@ -6653,7 +6490,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
         // Gathers are processed separately.
         if (TE->State != TreeEntry::Vectorize &&
             TE->State != TreeEntry::StridedVectorize &&
-            TE->State != TreeEntry::SplitVectorize &&
             (TE->State != TreeEntry::ScatterVectorize ||
              TE->ReorderIndices.empty()))
           continue;
@@ -6724,7 +6560,7 @@ void BoUpSLP::buildExternalUses(
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+    if (Entry->isGather())
       continue;
 
     // For each lane:
@@ -8406,141 +8242,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     return;
   }
 
-  // Tries to build split node.
-  constexpr unsigned SmallNodeSize = 4;
-  auto TrySplitNode = [&, &TTI = *TTI](unsigned SmallNodeSize,
-                                       const InstructionsState &LocalState) {
-    if (VL.size() <= SmallNodeSize || TTI.preferAlternateOpcodeVectorization())
-      return false;
-
-    // Any value is used in split node already - just gather.
-    if (any_of(VL, [&](Value *V) {
-          return ScalarsInSplitNodes.contains(V) || isVectorized(V);
-        })) {
-      if (TryToFindDuplicates(S))
-        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndices);
-      return true;
-    }
-    SmallVector<Value *> Op1, Op2;
-    OrdersType ReorderIndices(VL.size(), VL.size());
-    SmallBitVector Op1Indices(VL.size());
-    for (auto [Idx, V] : enumerate(VL)) {
-      auto *I = dyn_cast<Instruction>(V);
-      if (!I) {
-        Op1.push_back(V);
-        Op1Indices.set(Idx);
-        continue;
-      }
-      InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI);
-      if (NewS && !NewS.isAltShuffle()) {
-        Op1.push_back(V);
-        Op1Indices.set(Idx);
-        continue;
-      }
-      Op2.push_back(V);
-    }
-    Type *ScalarTy = getValueType(VL.front());
-    VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
-    unsigned Opcode0 = LocalState.getOpcode();
-    unsigned Opcode1 = LocalState.getAltOpcode();
-    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
-    // Enable split node, only if all nodes are power-of-2/full registers and
-    // do not form legal alternate instruction (like X86 addsub).
-    SmallPtrSet<Value *, 4> UOp1(Op1.begin(), Op1.end());
-    SmallPtrSet<Value *, 4> UOp2(Op2.begin(), Op2.end());
-    if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
-        TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), UOp1.size()) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), UOp2.size()) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op1.front()->getType(), Op1.size()) ||
-        !hasFullVectorsOrPowerOf2(TTI, Op2.front()->getType(), Op2.size()))
-      return false;
-    unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
-    for (unsigned Idx : seq<unsigned>(VL.size())) {
-      if (Op1Indices.test(Idx)) {
-        ReorderIndices[Op1Cnt] = Idx;
-        ++Op1Cnt;
-      } else {
-        ReorderIndices[Op2Cnt] = Idx;
-        ++Op2Cnt;
-      }
-    }
-    if (isIdentityOrder(ReorderIndices))
-      ReorderIndices.clear();
-    SmallVector<int> Mask;
-    if (!ReorderIndices.empty())
-      inversePermutation(ReorderIndices, Mask);
-    unsigned NumParts = TTI.getNumberOfParts(VecTy);
-    VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
-    VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
-    // Check non-profitable single register ops, which better to be represented
-    // as alternate ops.
-    if (NumParts >= VL.size())
-      return false;
-    if (LocalState.getMainOp()->isBinaryOp() &&
-        LocalState.getAltOp()->isBinaryOp() &&
-        (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
-         LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) {
-      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
-      InstructionCost InsertCost = ::getShuffleCost(
-          TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
-      FixedVectorType *SubVecTy =
-          getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
-      InstructionCost NewShuffleCost =
-          ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
-      if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
-        return false;
-      InstructionCost OriginalVecOpsCost =
-          TTI.getArithmeticInstrCost(Opcode0, VecTy, Kind) +
-          TTI.getArithmeticInstrCost(Opcode1, VecTy, Kind);
-      SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
-      for (unsigned Idx : seq<unsigned>(VL.size())) {
-        if (isa<PoisonValue>(VL[Idx]))
-          continue;
-        OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
-      }
-      InstructionCost OriginalCost =
-          OriginalVecOpsCost + ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
-                                                VecTy, OriginalMask, Kind);
-      InstructionCost NewVecOpsCost =
-          TTI.getArithmeticInstrCost(Opcode0, Op1VecTy, Kind) +
-          TTI.getArithmeticInstrCost(Opcode1, Op2VecTy, Kind);
-      InstructionCost NewCost = NewVecOpsCost + InsertCost;
-      // If not profitable to split - exit.
-      if (NewCost >= OriginalCost)
-        return false;
-    }
-
-    SmallVector<Value *> NewVL(VL.size());
-    copy(Op1, NewVL.begin());
-    copy(Op2, std::next(NewVL.begin(), Op1.size()));
-    auto *TE = newTreeEntry(VL, TreeEntry::SplitVectorize, std::nullopt,
-                            LocalState, UserTreeIdx, {}, ReorderIndices);
-    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
-    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
-      if (TreeEntry *PrevSE =
-              getSameValuesTreeEntry(Op.front(), Op, /*SameVF=*/true)) {
-        TE->CombinedEntriesWithIndices.emplace_back(PrevSE->Idx,
-                                                    Idx == 0 ? 0 : Op1.size());
-        PrevSE->UserTreeIndices.emplace_back(TE, Idx);
-      } else if (isa<LoadInst>(Op.front())) {
-        // Build gather node for loads, they will be gathered later.
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
-        (void)newTreeEntry(Op, TreeEntry::NeedToGather, std::nullopt,
-                           getSameOpcode(Op, *TLI), {TE, Idx});
-      } else {
-        TE->CombinedEntriesWithIndices.emplace_back(VectorizableTree.size(),
-                                                    Idx == 0 ? 0 : Op1.size());
-        buildTree_rec(Op, Depth, {TE, Idx});
-      }
-    };
-    AddNode(Op1, 0);
-    AddNode(Op2, 1);
-    return true;
-  };
-
   // If all of the operands are identical or constant we have a simple solution.
   // If we deal with insert/extract instructions, they all must have constant
   // indices, otherwise we should gather them, not try to vectorize.
@@ -8626,47 +8327,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
            S.getMainOp()) &&
        !all_of(VL, isVectorLikeInstWithConstOps)) ||
       NotProfitableForVectorization(VL)) {
-    if (!S) {
-      Instruction *MainOp = nullptr;
-      Instruction *AltOp = nullptr;
-      for (Value *V : VL) {
-        if (isa<PoisonValue>(V))
-          continue;
-        auto *I = dyn_cast<Instruction>(V);
-        if (!I) {
-          MainOp = AltOp = nullptr;
-          break;
-        }
-        if (!MainOp) {
-          MainOp = I;
-          continue;
-        }
-        if (MainOp->getOpcode() == I->getOpcode()) {
-          if (I->getParent() != MainOp->getParent()) {
-            MainOp = AltOp = nullptr;
-            break;
-          }
-          continue;
-        }
-        if (!AltOp) {
-          AltOp = I;
-          continue;
-        }
-        if (AltOp->getOpcode() == I->getOpcode()) {
-          if (I->getParent() != AltOp->getParent()) {
-            MainOp = AltOp = nullptr;
-            break;
-          }
-          continue;
-        }
-        MainOp = AltOp = nullptr;
-        break;
-      }
-      // Last chance to try to vectorize alternate node.
-      if (MainOp && AltOp &&
-          TrySplitNode(SmallNodeSize, InstructionsState(MainOp, AltOp)))
-        return;
-    }
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
     if (TryToFindDuplicates(S))
       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
@@ -8746,10 +8406,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     return;
   }
 
-  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
-  if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S))
-    return;
-
   // Check that every instruction appears once in this bundle.
   if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;
@@ -8782,9 +8438,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    // Last chance to try to vectorize alternate node.
-    if (S.isAltShuffle() && TrySplitNode(SmallNodeSize, S))
-      return;
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndices);
     NonScheduledFirst.insert(VL.front());
@@ -8929,7 +8582,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
             TE->dump());
         break;
       case TreeEntry::CombinedVectorize:
-      case TreeEntry::SplitVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected loads state.");
       }
@@ -10145,16 +9797,6 @@ void BoUpSLP::transformNodes() {
       reorderGatherNode(E);
   }
 
-  // Better to use full gathered loads analysis, if there are only 2 loads
-  // gathered nodes each having less than 16 elements.
-  constexpr unsigned VFLimit = 16;
-  bool ForceLoadGather =
-      count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->isGather() && TE->hasState() &&
-               TE->getOpcode() == Instruction::Load &&
-               TE->getVectorFactor() < VFLimit;
-      }) == 2;
-
   // The tree may grow here, so iterate over nodes, built before.
   for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
     TreeEntry &E = *VectorizableTree[Idx];
@@ -10169,42 +9811,6 @@ void BoUpSLP::transformNodes() {
             E.isAltShuffle() || !allSameBlock(VL)) ||
           allConstant(VL) || isSplat(VL))
         continue;
-      if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
-        continue;
-      auto AreReusedScalars =
-          [&](const TreeEntry *TE,
-              function_ref<bool(Value *, const TreeEntry *)> CheckContainer) {
-            return TE->isSame(VL) || all_of(VL, [&](Value *V) {
-                     if (isa<PoisonValue>(V))
-                       return true;
-                     auto *I = dyn_cast<Instruction>(V);
-                     return I && CheckContainer(I, TE);
-                   });
-          };
-      if (E.hasState()) {
-        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(E.getMainOp());
-            !TEs.empty() && any_of(TEs, [&](const TreeEntry *TE) {
-              return AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) {
-                return is_contained(TEs, TE);
-              });
-            }))
-          continue;
-        if (const TreeEntry *TE = ScalarsInSplitNodes.lookup(E.getMainOp()))
-          if (AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) {
-                return ScalarsInSplitNodes.lookup(V) == TE;
-              }))
-            continue;
-      } else {
-        // Check if the gather node full copy of split node.
-        auto *It = find_if(VL, IsaPred<Instruction>);
-        if (It != VL.end()) {
-          if (const TreeEntry *TE = ScalarsInSplitNodes.lookup(*It))
-            if (AreReusedScalars(TE, [&](Value *V, const TreeEntry *TE) {
-                  return ScalarsInSplitNodes.lookup(V) == TE;
-                }))
-              continue;
-        }
-      }
       // Try to find vectorizable sequences and transform them into a series of
       // insertvector instructions.
       unsigned StartIdx = 0;
@@ -11414,9 +11020,10 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
     return VE;
   const auto *It =
       find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
-        return (TE->isGather() || TE->State == TreeEntry::SplitVectorize) &&
-               is_contained(TE->UserTreeIndices,
-                            EdgeInfo(const_cast<TreeEntry *>(E), Idx));
+        return TE->isGather() &&
+               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
+                 return EI.EdgeIdx == Idx && EI.UserTE == E;
+               }) != TE->UserTreeIndices.end();
       });
   assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
   return It->get();
@@ -11496,32 +11103,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return processBuildVector<ShuffleCostEstimator, InstructionCost>(
         E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
   }
-  if (E->State == TreeEntry::SplitVectorize) {
-    assert(E->CombinedEntriesWithIndices.size() == 2 &&
-           "Expected exactly 2 combined entries.");
-    assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
-    InstructionCost VectorCost = 0;
-    if (E->ReorderIndices.empty()) {
-      VectorCost = ::getShuffleCost(
-          *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
-          E->CombinedEntriesWithIndices.back().second,
-          getWidenedType(
-              ScalarTy,
-              VectorizableTree[E->CombinedEntriesWithIndices.back().first]
-                  ->getVectorFactor()));
-    } else {
-      unsigned CommonVF =
-          std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
-                       ->getVectorFactor(),
-                   VectorizableTree[E->CombinedEntriesWithIndices.back().first]
-                       ->getVectorFactor());
-      VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
-                                    getWidenedType(ScalarTy, CommonVF),
-                                    E->getSplitMask(), CostKind);
-    }
-    LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
-    return VectorCost;
-  }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
   if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
@@ -11606,9 +11187,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               EI.EdgeIdx != 0) {
             auto UserBWIt = MinBWs.find(EI.UserTE);
             Type *UserScalarTy =
-                EI.UserTE->State == TreeEntry::SplitVectorize
-                    ? EI.UserTE->Scalars.front()->getType()
-                    : EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
+                EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
             if (UserBWIt != MinBWs.end())
               UserScalarTy = IntegerType::get(ScalarTy->getContext(),
                                               UserBWIt->second.first);
@@ -12098,7 +11677,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         break;
       }
       case TreeEntry::CombinedVectorize:
-      case TreeEntry::SplitVectorize:
       case TreeEntry::NeedToGather:
         llvm_unreachable("Unexpected vectorization state.");
       }
@@ -12571,8 +12149,6 @@ bool BoUpSLP::isTreeNotExtendable() const {
   bool Res = false;
   for (unsigned Idx : seq<unsigned>(getTreeSize())) {
     TreeEntry &E = *VectorizableTree[Idx];
-    if (E.State == TreeEntry::SplitVectorize)
-      return false;
     if (!E.isGather())
       continue;
     if (E.hasState() && E.getOpcode() != Instruction::Load)
@@ -12887,8 +12463,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
           TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
       continue;
     }
-    if (TE.hasState() &&
-        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
+    if (TE.isGather() && TE.hasState()) {
       if (const TreeEntry *E =
               getSameValuesTreeEntry(TE.getMainOp(), TE.Scalars);
           E && E->getVectorFactor() == TE.getVectorFactor()) {
@@ -14164,7 +13739,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
   assert(((GatheredLoadsEntriesFirst.has_value() &&
            E->getOpcode() == Instruction::Load && E->isGather() &&
            E->Idx < *GatheredLoadsEntriesFirst) ||
-          E->State == TreeEntry::SplitVectorize ||
           all_of(E->Scalars,
                  [=](Value *V) -> bool {
                    if (E->getOpcode() == Instruction::GetElementPtr &&
@@ -14190,7 +13764,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
       }
       assert(((E->getOpcode() == Instruction::GetElementPtr &&
                !isa<GetElementPtrInst>(I)) ||
-              E->State == TreeEntry::SplitVectorize ||
               (isVectorLikeInstWithConstOps(LastInst) &&
                isVectorLikeInstWithConstOps(I)) ||
               (GatheredLoadsEntriesFirst.has_value() &&
@@ -14252,11 +13825,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
     return FirstInst;
   };
 
-  if (E->State == TreeEntry::SplitVectorize) {
-    Res = FindLastInst();
-    return *Res;
-  }
-
   // Set insertpoint for gathered loads to the very first load.
   if (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
@@ -15156,15 +14724,12 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
   // Find the corresponding gather entry and vectorize it.
   // Allows to be more accurate with tree/graph transformations, checks for the
   // correctness of the transformations in many cases.
-  auto *I = find_if(
-      VectorizableTree, [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
-        return TE->isOperandGatherNode({E, NodeIdx}) ||
-               (TE->State == TreeEntry::SplitVectorize &&
-                is_contained(TE->UserTreeIndices, EdgeInfo(E, NodeIdx)));
-      });
-  assert(I != VectorizableTree.end() &&
-         "Gather/split node node is not in the graph.");
-  assert((!I->get()->isGather() || I->get()->UserTreeIndices.size() == 1) &&
+  auto *I = find_if(VectorizableTree,
+                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
+                      return TE->isOperandGatherNode({E, NodeIdx});
+                    });
+  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
+  assert(I->get()->UserTreeIndices.size() == 1 &&
          "Expected only single user for the gather node.");
   assert(I->get()->isSame(VL) && "Expected same list of scalars.");
   return vectorizeTree(I->get(), PostponedPHIs);
@@ -15711,93 +15276,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     E->VectorizedValue = Vec;
     return Vec;
   }
-  if (E->State == TreeEntry::SplitVectorize) {
-    assert(E->CombinedEntriesWithIndices.size() == 2 &&
-           "Expected exactly 2 combined entries.");
-    setInsertPointAfterBundle(E);
-    TreeEntry &OpTE1 =
-        *VectorizableTree[E->CombinedEntriesWithIndices.front().first].get();
-    assert(OpTE1.isSame(
-               ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
-           "Expected same first part of scalars.");
-    Value *Op1 = vectorizeTree(&OpTE1, PostponedPHIs);
-    if (E->VectorizedValue) {
-      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for node " << E->Idx << ".\n";
-                 E->dump());
-      return E->VectorizedValue;
-    }
-    TreeEntry &OpTE2 =
-        *VectorizableTree[E->CombinedEntriesWithIndices.back().first].get();
-    assert(
-        OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
-        "Expected same second part of scalars.");
-    Value *Op2 = vectorizeTree(&OpTE2, PostponedPHIs);
-    if (E->VectorizedValue) {
-      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for node " << E->Idx << ".\n";
-                 E->dump());
-      return E->VectorizedValue;
-    }
-    auto GetOperandSignedness = [&](const TreeEntry *OpE) {
-      bool IsSigned = false;
-      auto It = MinBWs.find(OpE);
-      if (It != MinBWs.end())
-        IsSigned = It->second.second;
-      else
-        IsSigned = any_of(OpE->Scalars, [&](Value *R) {
-          if (isa<PoisonValue>(V))
-            return false;
-          return !isKnownNonNegative(R, SimplifyQuery(*DL));
-        });
-      return IsSigned;
-    };
-    if (cast<VectorType>(Op1->getType())->getElementType() != ScalarTy) {
-      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
-      Op1 = Builder.CreateIntCast(
-          Op1,
-          getWidenedType(
-              ScalarTy,
-              cast<FixedVectorType>(Op1->getType())->getNumElements()),
-          GetOperandSignedness(&OpTE1));
-    }
-    if (cast<VectorType>(Op2->getType())->getElementType() != ScalarTy) {
-      assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
-      Op2 = Builder.CreateIntCast(
-          Op2,
-          getWidenedType(
-              ScalarTy,
-              cast<FixedVectorType>(Op2->getType())->getNumElements()),
-          GetOperandSignedness(&OpTE2));
-    }
-    if (E->ReorderIndices.empty()) {
-      SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
-      std::iota(
-          Mask.begin(),
-          std::next(Mask.begin(), E->CombinedEntriesWithIndices.back().second),
-          0);
-      Value *Vec = Builder.CreateShuffleVector(Op1, Mask);
-      Vec = createInsertVector(Builder, Vec, Op2,
-                               E->CombinedEntriesWithIndices.back().second);
-      E->VectorizedValue = Vec;
-      return Vec;
-    }
-    unsigned CommonVF =
-        std::max(OpTE1.getVectorFactor(), OpTE2.getVectorFactor());
-    if (getNumElements(Op1->getType()) != CommonVF) {
-      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
-      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE1.getVectorFactor()),
-                0);
-      Op1 = Builder.CreateShuffleVector(Op1, Mask);
-    }
-    if (getNumElements(Op2->getType()) != CommonVF) {
-      SmallVector<int> Mask(CommonVF, PoisonMaskElem);
-      std::iota(Mask.begin(), std::next(Mask.begin(), OpTE2.getVectorFactor()),
-                0);
-      Op2 = Builder.CreateShuffleVector(Op2, Mask);
-    }
-    Value *Vec = Builder.CreateShuffleVector(Op1, Op2, E->getSplitMask());
-    E->VectorizedValue = Vec;
-    return Vec;
-  }
 
   bool IsReverseOrder =
       !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
@@ -16847,15 +16325,13 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
   DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
   for (const TreeEntry *E : PostponedNodes) {
     auto *TE = const_cast<TreeEntry *>(E);
-    if (TE->UserTreeIndices.front().UserTE->State != TreeEntry::SplitVectorize)
-      if (auto *VecTE = getSameValuesTreeEntry(
-              TE->Scalars.front(),
-              TE->UserTreeIndices.front().UserTE->getOperand(
-                  TE->UserTreeIndices.front().EdgeIdx));
-          VecTE && VecTE->isSame(TE->Scalars))
-        // Found gather node which is absolutely the same as one of the
-        // vectorized nodes. It may happen after reordering.
-        continue;
+    if (auto *VecTE = getSameValuesTreeEntry(
+            TE->Scalars.front(), TE->UserTreeIndices.front().UserTE->getOperand(
+                                     TE->UserTreeIndices.front().EdgeIdx));
+        VecTE && VecTE->isSame(TE->Scalars))
+      // Found gather node which is absolutely the same as one of the
+      // vectorized nodes. It may happen after reordering.
+      continue;
     auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
     TE->VectorizedValue = nullptr;
     auto *UserI =
@@ -17349,7 +16825,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+    if (Entry->isGather())
       continue;
 
     assert(Entry->VectorizedValue && "Can't find vectorizable value");
@@ -17406,9 +16882,6 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       return EI.UserTE == VectorizableTree.front().get() &&
                              EI.EdgeIdx == UINT_MAX;
                     }))) &&
-          !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
-            !IE->UserTreeIndices.empty() &&
-            is_contained(VectorizableTree.front()->Scalars, I)) &&
           !(GatheredLoadsEntriesFirst.has_value() &&
             IE->Idx >= *GatheredLoadsEntriesFirst &&
             VectorizableTree.front()->isGather() &&
@@ -18432,13 +17905,6 @@ bool BoUpSLP::collectValuesToDemote(
         ToDemote.push_back(E.Idx);
         return IsProfitableToDemote;
       };
-
-  if (E.State == TreeEntry::SplitVectorize)
-    return TryProcessInstruction(
-        BitWidth,
-        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
-         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
-
   switch (E.getOpcode()) {
 
   // We can always demote truncations and extensions. Since truncations can

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index 92027d0043f76c..c431b058f0d2d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -17,12 +17,15 @@
 
 define void @s116_modified(ptr %a) {
 ; CHECK-LABEL: @s116_modified(
-; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4
+; CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 2
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 3
 ; CHECK-NEXT:    [[LD0:%.*]] = load float, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[LD0]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    store <4 x float> [[TMP8]], ptr [[GEP1]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 3cac5c0d9e041a..11fa3337544a1a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,272 +1,668 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s
 ; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-15 | FileCheck %s --check-prefix=THR15
 
 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
+; CHECK-NEXT:    [[CONV1:%.*]] = zext i8 [[TMP0]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; CHECK-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP10]] to i32
 ; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
+; CHECK-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], splat (i32 16)
+; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
+; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
+; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
+; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32>
+; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
+; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16)
+; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
+; CHECK-NEXT:    [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
+; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
+; CHECK-NEXT:    [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]]
+; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
+; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP116:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = sub <4 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; CHECK-NEXT:    [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; CHECK-NEXT:    [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; CHECK-NEXT:    [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; CHECK-NEXT:    [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; CHECK-NEXT:    [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP35:%.*]] = add <4 x i32> [[TMP33]], [[TMP34]]
-; CHECK-NEXT:    [[TMP36:%.*]] = sub <4 x i32> [[TMP33]], [[TMP34]]
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT:    [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP42:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP43:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; CHECK-NEXT:    [[TMP44:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; CHECK-NEXT:    [[TMP45:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; CHECK-NEXT:    [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT:    [[TMP47:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; CHECK-NEXT:    [[TMP48:%.*]] = zext <4 x i8> [[TMP47]] to <4 x i32>
-; CHECK-NEXT:    [[TMP49:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; CHECK-NEXT:    [[TMP50:%.*]] = zext <4 x i8> [[TMP49]] to <4 x i32>
-; CHECK-NEXT:    [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; CHECK-NEXT:    [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; CHECK-NEXT:    [[TMP53:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; CHECK-NEXT:    [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP53]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP55:%.*]] = add <4 x i32> [[TMP53]], [[TMP54]]
-; CHECK-NEXT:    [[TMP56:%.*]] = sub <4 x i32> [[TMP53]], [[TMP54]]
-; CHECK-NEXT:    [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT:    [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT:    [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT:    [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; CHECK-NEXT:    [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; CHECK-NEXT:    [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; CHECK-NEXT:    [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP116]], i32 0
-; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP62]], i64 2)
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
-; CHECK-NEXT:    [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
-; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
-; CHECK-NEXT:    [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
-; CHECK-NEXT:    [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
-; CHECK-NEXT:    [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP80:%.*]] = add <4 x i32> [[TMP78]], [[TMP79]]
-; CHECK-NEXT:    [[TMP81:%.*]] = sub <4 x i32> [[TMP78]], [[TMP79]]
-; CHECK-NEXT:    [[TMP82:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <4 x i32> [[TMP82]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP84:%.*]] = add <4 x i32> [[TMP82]], [[TMP83]]
-; CHECK-NEXT:    [[TMP85:%.*]] = sub <4 x i32> [[TMP82]], [[TMP83]]
-; CHECK-NEXT:    [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP84]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP87:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
-; CHECK-NEXT:    [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
-; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP87]], i64 4)
-; CHECK-NEXT:    [[TMP91:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
-; CHECK-NEXT:    [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
-; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP91]], i64 4)
-; CHECK-NEXT:    [[TMP95:%.*]] = add <8 x i32> [[TMP94]], [[TMP90]]
-; CHECK-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP88]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP97:%.*]] = shufflevector <4 x i32> [[TMP91]], <4 x i32> [[TMP92]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP98:%.*]] = sub <8 x i32> [[TMP96]], [[TMP97]]
-; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP98]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 8, i32 12, i32 9, i32 13, i32 10, i32 14, i32 11, i32 15>
-; CHECK-NEXT:    [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP101:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP102:%.*]] = shufflevector <16 x i32> [[TMP100]], <16 x i32> [[TMP101]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP104:%.*]] = shufflevector <16 x i32> [[TMP102]], <16 x i32> [[TMP103]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <16 x i32> [[TMP104]], <16 x i32> [[TMP105]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
-; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP108:%.*]] = shufflevector <16 x i32> [[TMP106]], <16 x i32> [[TMP107]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP110:%.*]] = shufflevector <16 x i32> [[TMP108]], <16 x i32> [[TMP109]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP111:%.*]] = lshr <16 x i32> [[TMP110]], splat (i32 15)
-; CHECK-NEXT:    [[TMP112:%.*]] = and <16 x i32> [[TMP111]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP113:%.*]] = mul <16 x i32> [[TMP112]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP114:%.*]] = add <16 x i32> [[TMP113]], [[TMP99]]
-; CHECK-NEXT:    [[TMP115:%.*]] = xor <16 x i32> [[TMP114]], [[TMP110]]
-; CHECK-NEXT:    [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP115]])
+; CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1
+; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP52]] to i32
+; CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1
+; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
+; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
+; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
+; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
+; CHECK-NEXT:    [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]]
+; CHECK-NEXT:    [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16)
+; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]]
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
+; CHECK-NEXT:    [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
+; CHECK-NEXT:    [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
+; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
+; CHECK-NEXT:    [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
+; CHECK-NEXT:    [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32>
+; CHECK-NEXT:    [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16)
+; CHECK-NEXT:    [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]]
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0
+; CHECK-NEXT:    [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]]
+; CHECK-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1
+; CHECK-NEXT:    [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]]
+; CHECK-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]]
+; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0
+; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
+; CHECK-NEXT:    [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
+; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
+; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
+; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
+; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]]
+; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
+; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
+; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT:    [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32>
+; CHECK-NEXT:    [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT:    [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]]
+; CHECK-NEXT:    [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16)
+; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
+; CHECK-NEXT:    [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16)
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]]
+; CHECK-NEXT:    [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]]
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]]
+; CHECK-NEXT:    [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]]
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0
+; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]]
+; CHECK-NEXT:    [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]]
+; CHECK-NEXT:    [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0
+; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1
+; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]]
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15
+; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15
+; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
+; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; CHECK-NEXT:    [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32>
+; CHECK-NEXT:    [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32>
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]]
+; CHECK-NEXT:    [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16)
+; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32>
+; CHECK-NEXT:    [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT:    [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]]
+; CHECK-NEXT:    [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16)
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]]
+; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]]
+; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]]
+; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT:    [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]]
+; CHECK-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]]
+; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0
+; CHECK-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15
+; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
+; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
+; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15
+; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
+; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
+; CHECK-NEXT:    [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15)
+; CHECK-NEXT:    [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535)
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
+; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]]
+; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]]
+; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
+; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
+; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32>
+; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0
+; CHECK-NEXT:    [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0
+; CHECK-NEXT:    [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]]
+; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]]
+; CHECK-NEXT:    [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15)
+; CHECK-NEXT:    [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0
+; CHECK-NEXT:    [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0
+; CHECK-NEXT:    [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]]
+; CHECK-NEXT:    [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]]
+; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
+; CHECK-NEXT:    [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
+; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
+; CHECK-NEXT:    [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]]
+; CHECK-NEXT:    [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
+; CHECK-NEXT:    [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
+; CHECK-NEXT:    [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; CHECK-NEXT:    [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]]
+; CHECK-NEXT:    [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
+; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0
+; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
+; CHECK-NEXT:    [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
+; CHECK-NEXT:    [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]]
+; CHECK-NEXT:    [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]]
+; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; CHECK-NEXT:    [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
+; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]]
+; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]]
+; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
+; CHECK-NEXT:    [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]]
+; CHECK-NEXT:    [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15
+; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; CHECK-NEXT:    [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]]
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]]
+; CHECK-NEXT:    [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]]
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1
+; CHECK-NEXT:    [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]]
+; CHECK-NEXT:    [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]]
+; CHECK-NEXT:    [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]]
+; CHECK-NEXT:    [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]]
+; CHECK-NEXT:    [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0
+; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]]
+; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]]
+; CHECK-NEXT:    [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]]
+; CHECK-NEXT:    [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15)
+; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]]
+; CHECK-NEXT:    [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
+; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
+; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
+; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
+; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
+; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]]
+; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
+; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
+; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
+; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]]
+; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
 ;
 ; THR15-LABEL: define i32 @test(
 ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
 ; THR15-NEXT:  entry:
+; THR15-NEXT:    [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
+; THR15-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
 ; THR15-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4
+; THR15-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1
+; THR15-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3
+; THR15-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
+; THR15-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP1]] to i32
 ; THR15-NEXT:    [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]]
 ; THR15-NEXT:    [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]]
+; THR15-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; THR15-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP2]] to i32
 ; THR15-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4
+; THR15-NEXT:    [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1
+; THR15-NEXT:    [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3
+; THR15-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
+; THR15-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32
 ; THR15-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; THR15-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; THR15-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; THR15-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT:    [[TMP0:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP1:%.*]] = load i8, ptr null, align 1
-; THR15-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
-; THR15-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; THR15-NEXT:    [[TMP42:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; THR15-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP42]] to <4 x i32>
-; THR15-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; THR15-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
-; THR15-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; THR15-NEXT:    [[TMP44:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; THR15-NEXT:    [[TMP10:%.*]] = zext <4 x i8> [[TMP44]] to <4 x i32>
-; THR15-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP8]], [[TMP10]]
-; THR15-NEXT:    [[TMP12:%.*]] = shl <4 x i32> [[TMP11]], splat (i32 16)
-; THR15-NEXT:    [[TMP47:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; THR15-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP47]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP47]], [[TMP14]]
-; THR15-NEXT:    [[TMP49:%.*]] = sub <4 x i32> [[TMP47]], [[TMP14]]
-; THR15-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> [[TMP49]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; THR15-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT:    [[TMP20:%.*]] = sub <4 x i32> [[TMP17]], [[TMP18]]
-; THR15-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT:    [[TMP22:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1
-; THR15-NEXT:    [[TMP23:%.*]] = zext <4 x i8> [[TMP22]] to <4 x i32>
-; THR15-NEXT:    [[TMP24:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; THR15-NEXT:    [[TMP25:%.*]] = zext <4 x i8> [[TMP24]] to <4 x i32>
-; THR15-NEXT:    [[TMP26:%.*]] = sub <4 x i32> [[TMP23]], [[TMP25]]
-; THR15-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; THR15-NEXT:    [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; THR15-NEXT:    [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; THR15-NEXT:    [[TMP30:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
-; THR15-NEXT:    [[TMP31:%.*]] = sub <4 x i32> [[TMP28]], [[TMP30]]
-; THR15-NEXT:    [[TMP32:%.*]] = shl <4 x i32> [[TMP31]], splat (i32 16)
-; THR15-NEXT:    [[TMP33:%.*]] = add <4 x i32> [[TMP32]], [[TMP26]]
-; THR15-NEXT:    [[TMP34:%.*]] = shufflevector <4 x i32> [[TMP33]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT:    [[TMP35:%.*]] = add <4 x i32> [[TMP33]], [[TMP34]]
-; THR15-NEXT:    [[TMP36:%.*]] = sub <4 x i32> [[TMP33]], [[TMP34]]
-; THR15-NEXT:    [[TMP37:%.*]] = shufflevector <4 x i32> [[TMP35]], <4 x i32> [[TMP36]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; THR15-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT:    [[TMP39:%.*]] = add <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT:    [[TMP40:%.*]] = sub <4 x i32> [[TMP37]], [[TMP38]]
-; THR15-NEXT:    [[TMP41:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> [[TMP40]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; THR15-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; THR15-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; THR15-NEXT:    [[TMP43:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; THR15-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; THR15-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; THR15-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
+; THR15-NEXT:    [[TMP87:%.*]] = zext i8 [[TMP6]] to i32
 ; THR15-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
-; THR15-NEXT:    [[TMP45:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32>
-; THR15-NEXT:    [[TMP46:%.*]] = sub <4 x i32> [[TMP43]], [[TMP45]]
+; THR15-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32>
+; THR15-NEXT:    [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]]
 ; THR15-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
-; THR15-NEXT:    [[TMP48:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32>
+; THR15-NEXT:    [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32>
 ; THR15-NEXT:    [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
-; THR15-NEXT:    [[TMP50:%.*]] = zext <4 x i8> [[TMP16]] to <4 x i32>
-; THR15-NEXT:    [[TMP51:%.*]] = sub <4 x i32> [[TMP48]], [[TMP50]]
-; THR15-NEXT:    [[TMP52:%.*]] = shl <4 x i32> [[TMP51]], splat (i32 16)
-; THR15-NEXT:    [[TMP62:%.*]] = add <4 x i32> [[TMP52]], [[TMP46]]
-; THR15-NEXT:    [[TMP54:%.*]] = shufflevector <4 x i32> [[TMP62]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT:    [[TMP55:%.*]] = add <4 x i32> [[TMP62]], [[TMP54]]
-; THR15-NEXT:    [[TMP56:%.*]] = sub <4 x i32> [[TMP62]], [[TMP54]]
-; THR15-NEXT:    [[TMP57:%.*]] = shufflevector <4 x i32> [[TMP55]], <4 x i32> [[TMP56]], <4 x i32> <i32 3, i32 6, i32 1, i32 4>
-; THR15-NEXT:    [[TMP58:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT:    [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT:    [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
-; THR15-NEXT:    [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; THR15-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32>
+; THR15-NEXT:    [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]]
+; THR15-NEXT:    [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], splat (i32 16)
+; THR15-NEXT:    [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]]
+; THR15-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32>
+; THR15-NEXT:    [[TMP86:%.*]] = zext i8 [[TMP7]] to i32
+; THR15-NEXT:    [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32>
+; THR15-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]]
+; THR15-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32>
+; THR15-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32>
+; THR15-NEXT:    [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]]
+; THR15-NEXT:    [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], splat (i32 16)
+; THR15-NEXT:    [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]]
+; THR15-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]]
+; THR15-NEXT:    [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]]
+; THR15-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
+; THR15-NEXT:    [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
+; THR15-NEXT:    [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]]
+; THR15-NEXT:    [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0
+; THR15-NEXT:    [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1
+; THR15-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]]
+; THR15-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
+; THR15-NEXT:    [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1
+; THR15-NEXT:    [[TMP48:%.*]] = load i8, ptr null, align 1
+; THR15-NEXT:    [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32>
+; THR15-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; THR15-NEXT:    [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1
+; THR15-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32>
+; THR15-NEXT:    [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]]
 ; THR15-NEXT:    [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
-; THR15-NEXT:    [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
-; THR15-NEXT:    [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
-; THR15-NEXT:    [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
-; THR15-NEXT:    [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
-; THR15-NEXT:    [[TMP155:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT:    [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
-; THR15-NEXT:    [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; THR15-NEXT:    [[TMP71:%.*]] = call <4 x i8> @llvm.vector.insert.v4i8.v2i8(<4 x i8> [[TMP70]], <2 x i8> [[TMP53]], i64 2)
-; THR15-NEXT:    [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
-; THR15-NEXT:    [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; THR15-NEXT:    [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
-; THR15-NEXT:    [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT:    [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
-; THR15-NEXT:    [[TMP165:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
-; THR15-NEXT:    [[TMP166:%.*]] = add <4 x i32> [[TMP165]], [[TMP155]]
-; THR15-NEXT:    [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP166]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; THR15-NEXT:    [[TMP80:%.*]] = add <4 x i32> [[TMP166]], [[TMP79]]
-; THR15-NEXT:    [[TMP81:%.*]] = sub <4 x i32> [[TMP166]], [[TMP79]]
-; THR15-NEXT:    [[TMP222:%.*]] = shufflevector <4 x i32> [[TMP80]], <4 x i32> [[TMP81]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; THR15-NEXT:    [[TMP217:%.*]] = shufflevector <4 x i32> [[TMP222]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; THR15-NEXT:    [[TMP223:%.*]] = add <4 x i32> [[TMP222]], [[TMP217]]
-; THR15-NEXT:    [[TMP85:%.*]] = sub <4 x i32> [[TMP222]], [[TMP217]]
-; THR15-NEXT:    [[TMP86:%.*]] = shufflevector <4 x i32> [[TMP223]], <4 x i32> [[TMP85]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT:    [[TMP91:%.*]] = add <4 x i32> [[TMP41]], [[TMP21]]
-; THR15-NEXT:    [[TMP88:%.*]] = sub <4 x i32> [[TMP21]], [[TMP41]]
-; THR15-NEXT:    [[TMP89:%.*]] = shufflevector <4 x i32> [[TMP88]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP94:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP89]], <4 x i32> [[TMP91]], i64 4)
-; THR15-NEXT:    [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP61]]
-; THR15-NEXT:    [[TMP92:%.*]] = sub <4 x i32> [[TMP61]], [[TMP86]]
-; THR15-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i32> [[TMP92]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP90:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP93]], <4 x i32> [[TMP87]], i64 4)
-; THR15-NEXT:    [[TMP95:%.*]] = add <8 x i32> [[TMP90]], [[TMP94]]
-; THR15-NEXT:    [[TMP97:%.*]] = shufflevector <4 x i32> [[TMP91]], <4 x i32> [[TMP88]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; THR15-NEXT:    [[TMP99:%.*]] = shufflevector <4 x i32> [[TMP87]], <4 x i32> [[TMP92]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; THR15-NEXT:    [[TMP100:%.*]] = sub <8 x i32> [[TMP97]], [[TMP99]]
-; THR15-NEXT:    [[TMP102:%.*]] = shufflevector <8 x i32> [[TMP95]], <8 x i32> [[TMP100]], <16 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 8, i32 12, i32 9, i32 13, i32 10, i32 14, i32 11, i32 15>
-; THR15-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i32> [[TMP57]], <4 x i32> [[TMP64]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP104:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP105:%.*]] = shufflevector <16 x i32> [[TMP103]], <16 x i32> [[TMP104]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP107:%.*]] = shufflevector <16 x i32> [[TMP105]], <16 x i32> [[TMP106]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 18, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP108:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP109:%.*]] = shufflevector <16 x i32> [[TMP107]], <16 x i32> [[TMP108]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 poison, i32 18, i32 19>
-; THR15-NEXT:    [[TMP110:%.*]] = shufflevector <4 x i32> [[TMP37]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP111:%.*]] = shufflevector <16 x i32> [[TMP109]], <16 x i32> [[TMP110]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 poison, i32 17, i32 poison, i32 12, i32 poison, i32 14, i32 15>
-; THR15-NEXT:    [[TMP112:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; THR15-NEXT:    [[TMP113:%.*]] = shufflevector <16 x i32> [[TMP111]], <16 x i32> [[TMP112]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 10, i32 17, i32 12, i32 18, i32 14, i32 15>
-; THR15-NEXT:    [[TMP114:%.*]] = lshr <16 x i32> [[TMP113]], splat (i32 15)
-; THR15-NEXT:    [[TMP115:%.*]] = and <16 x i32> [[TMP114]], splat (i32 65537)
-; THR15-NEXT:    [[TMP116:%.*]] = mul <16 x i32> [[TMP115]], splat (i32 65535)
-; THR15-NEXT:    [[TMP117:%.*]] = add <16 x i32> [[TMP116]], [[TMP102]]
-; THR15-NEXT:    [[TMP118:%.*]] = xor <16 x i32> [[TMP117]], [[TMP113]]
-; THR15-NEXT:    [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP118]])
+; THR15-NEXT:    [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; THR15-NEXT:    [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; THR15-NEXT:    [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; THR15-NEXT:    [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32>
+; THR15-NEXT:    [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]]
+; THR15-NEXT:    [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], splat (i32 16)
+; THR15-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]]
+; THR15-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
+; THR15-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; THR15-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
+; THR15-NEXT:    [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
+; THR15-NEXT:    [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32>
+; THR15-NEXT:    [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
+; THR15-NEXT:    [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32>
+; THR15-NEXT:    [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]]
+; THR15-NEXT:    [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
+; THR15-NEXT:    [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32>
+; THR15-NEXT:    [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
+; THR15-NEXT:    [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
+; THR15-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
+; THR15-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16)
+; THR15-NEXT:    [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]]
+; THR15-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
+; THR15-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; THR15-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]]
+; THR15-NEXT:    [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]]
+; THR15-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0
+; THR15-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1
+; THR15-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
+; THR15-NEXT:    [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]]
+; THR15-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
+; THR15-NEXT:    [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0
+; THR15-NEXT:    [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[ADD55_3]], i32 0
+; THR15-NEXT:    [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]]
+; THR15-NEXT:    [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]]
+; THR15-NEXT:    [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THR15-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0
+; THR15-NEXT:    [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], i32 0
+; THR15-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]]
+; THR15-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]]
+; THR15-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]]
+; THR15-NEXT:    [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15
+; THR15-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; THR15-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
+; THR15-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15
+; THR15-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
+; THR15-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
+; THR15-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]]
+; THR15-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]]
+; THR15-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15
+; THR15-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
+; THR15-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
+; THR15-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15
+; THR15-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; THR15-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; THR15-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0
+; THR15-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1
+; THR15-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; THR15-NEXT:    [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; THR15-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; THR15-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
+; THR15-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
+; THR15-NEXT:    [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; THR15-NEXT:    [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; THR15-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]]
+; THR15-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]]
+; THR15-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
+; THR15-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
+; THR15-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; THR15-NEXT:    [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; THR15-NEXT:    [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32>
+; THR15-NEXT:    [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; THR15-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
+; THR15-NEXT:    [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
+; THR15-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; THR15-NEXT:    [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; THR15-NEXT:    [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32>
+; THR15-NEXT:    [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]]
+; THR15-NEXT:    [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], splat (i32 16)
+; THR15-NEXT:    [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32>
+; THR15-NEXT:    [[TMP104:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
+; THR15-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
+; THR15-NEXT:    [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]]
+; THR15-NEXT:    [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], splat (i32 16)
+; THR15-NEXT:    [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1
+; THR15-NEXT:    [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]]
+; THR15-NEXT:    [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]]
+; THR15-NEXT:    [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0
+; THR15-NEXT:    [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]]
+; THR15-NEXT:    [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]]
+; THR15-NEXT:    [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]]
+; THR15-NEXT:    [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]]
+; THR15-NEXT:    [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0
+; THR15-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1
+; THR15-NEXT:    [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]]
+; THR15-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]]
+; THR15-NEXT:    [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0
+; THR15-NEXT:    [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1
+; THR15-NEXT:    [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]]
+; THR15-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]]
+; THR15-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15
+; THR15-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; THR15-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; THR15-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15
+; THR15-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
+; THR15-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
+; THR15-NEXT:    [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; THR15-NEXT:    [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32>
+; THR15-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; THR15-NEXT:    [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32>
+; THR15-NEXT:    [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; THR15-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; THR15-NEXT:    [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; THR15-NEXT:    [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; THR15-NEXT:    [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32>
+; THR15-NEXT:    [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]]
+; THR15-NEXT:    [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], splat (i32 16)
+; THR15-NEXT:    [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32>
+; THR15-NEXT:    [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32>
+; THR15-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; THR15-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; THR15-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]]
+; THR15-NEXT:    [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16)
+; THR15-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1
+; THR15-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]]
+; THR15-NEXT:    [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]]
+; THR15-NEXT:    [[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0
+; THR15-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]]
+; THR15-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]]
+; THR15-NEXT:    [[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]]
+; THR15-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]]
+; THR15-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0
+; THR15-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1
+; THR15-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]]
+; THR15-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]]
+; THR15-NEXT:    [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0
+; THR15-NEXT:    [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1
+; THR15-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]]
+; THR15-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]]
+; THR15-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15
+; THR15-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
+; THR15-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
+; THR15-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15
+; THR15-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
+; THR15-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
+; THR15-NEXT:    [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], splat (i32 15)
+; THR15-NEXT:    [[TMP157:%.*]] = and <2 x i32> [[TMP156]], splat (i32 65537)
+; THR15-NEXT:    [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], splat (i32 65535)
+; THR15-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]]
+; THR15-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]]
+; THR15-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
+; THR15-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
+; THR15-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
+; THR15-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
+; THR15-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
+; THR15-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]]
+; THR15-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
+; THR15-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]]
+; THR15-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
+; THR15-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]]
+; THR15-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; THR15-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]]
+; THR15-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
+; THR15-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
+; THR15-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; THR15-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
+; THR15-NEXT:    [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
+; THR15-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
+; THR15-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
+; THR15-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; THR15-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
+; THR15-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
+; THR15-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]]
+; THR15-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
+; THR15-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]]
+; THR15-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; THR15-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]]
+; THR15-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
+; THR15-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]]
+; THR15-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
+; THR15-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
+; THR15-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; THR15-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
+; THR15-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
+; THR15-NEXT:    [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
+; THR15-NEXT:    [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; THR15-NEXT:    [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; THR15-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]]
+; THR15-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT:    [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]]
+; THR15-NEXT:    [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]]
+; THR15-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
+; THR15-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
+; THR15-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]]
+; THR15-NEXT:    [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]]
+; THR15-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15
+; THR15-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; THR15-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; THR15-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; THR15-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]]
+; THR15-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
+; THR15-NEXT:    [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0
+; THR15-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]]
+; THR15-NEXT:    [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1
+; THR15-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]]
+; THR15-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
+; THR15-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[SUB59]]
+; THR15-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]]
+; THR15-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
+; THR15-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
+; THR15-NEXT:    [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THR15-NEXT:    [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT:    [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]]
+; THR15-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> <i32 0, i32 3>
+; THR15-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
+; THR15-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
+; THR15-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
+; THR15-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
+; THR15-NEXT:    [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], splat (i32 15)
+; THR15-NEXT:    [[TMP180:%.*]] = and <2 x i32> [[TMP179]], splat (i32 65537)
+; THR15-NEXT:    [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], splat (i32 65535)
+; THR15-NEXT:    [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]]
+; THR15-NEXT:    [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]]
+; THR15-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
+; THR15-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
+; THR15-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
+; THR15-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
+; THR15-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
+; THR15-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
+; THR15-NEXT:    [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0
+; THR15-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]]
+; THR15-NEXT:    [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1
+; THR15-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]]
+; THR15-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; THR15-NEXT:    ret i32 [[ADD113_3]]
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 3243579f11820a..85131758853b3d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -1027,8 +1027,10 @@ define i32 @stride_sum_abs_
diff (ptr %p, ptr %q, i64 %stride) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index b374e877bb38a4..e24c52ba81ddf4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,18 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; CHECK-LABEL: @sitofp_uitofp(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -44,11 +42,9 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; CHECK-LABEL: @fptosi_fptoui(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -79,39 +75,11 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 }
 
 define <8 x float> @fneg_fabs(<8 x float> %a) {
-; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; CHECK-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -158,11 +126,9 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
 
 define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; CHECK-LABEL: @sext_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index ddd3dffaafcc5d..0f8751a6da7f57 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -1,18 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; CHECK-LABEL: @sitofp_uitofp(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
@@ -44,11 +42,9 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; CHECK-LABEL: @fptosi_fptoui(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
@@ -79,39 +75,11 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 }
 
 define <8 x float> @fneg_fabs(<8 x float> %a) {
-; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP3:%.*]] = fneg <4 x float> [[TMP1]]
-; SLM-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
-;
-; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
+; CHECK-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -158,11 +126,9 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
 
 define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; CHECK-LABEL: @sext_zext(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
index 5cee6984df04fe..5a1de4f3e3d7fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -1,47 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT:    ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fadd_fsub_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX2-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @fadd_fsub_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -79,43 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT:    ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX2-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -172,10 +110,6 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
-; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; AVX2-NEXT:    ret <4 x float> [[TMP1]]
-;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index 9a2f959ac63bc5..046ed781f4c8d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -1,47 +1,17 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
 
 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT:    ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fadd_fsub_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX2-NEXT:    ret <8 x float> [[TMP3]]
-;
-; AVX512-LABEL: @fadd_fsub_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -79,43 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SSE-NEXT:    ret <8 x float> [[TMP5]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; SLM-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
-; AVX2-NEXT:    ret <8 x float> [[TMP5]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[TMP3]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -172,10 +110,6 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
-; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; AVX2-NEXT:    ret <4 x float> [[TMP1]]
-;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index f8c5df99445385..8839fc22817888 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -7,39 +7,11 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX1-LABEL: @add_sub_v8i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX1-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @add_sub_v8i32(
-; AVX2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @add_sub_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -134,16 +106,14 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
+; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -204,16 +174,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
+; SSE-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -531,49 +501,13 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 }
 
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: @add_sub_v8i32_splat(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[TMP7]]
-;
-; SLM-LABEL: @add_sub_v8i32_splat(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[TMP7]]
-;
-; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX1-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP5]]
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP5]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index b84ef027f67c50..dfa918a6ea4532 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -7,39 +7,11 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
 
 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX1-LABEL: @add_sub_v8i32(
-; AVX1-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX1-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX2-LABEL: @add_sub_v8i32(
-; AVX2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
-;
-; AVX512-LABEL: @add_sub_v8i32(
-; AVX512-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -134,16 +106,14 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
+; SSE-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
 ; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
@@ -204,16 +174,16 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[TMP5]]
+; SSE-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[TMP5]]
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
@@ -531,49 +501,13 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 }
 
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
-; SSE-LABEL: @add_sub_v8i32_splat(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[TMP7]]
-;
-; SLM-LABEL: @add_sub_v8i32_splat(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x i32> [[TMP7]]
-;
-; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX1-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
-;
-; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
-; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[TMP5]]
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP5]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
index 7ed5f33c9dc6cb..b659c10bb2fbf9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll
@@ -7,7 +7,7 @@ define void @test() {
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 1, 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[ADD]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer
-; CHECK-NEXT:    [[ICMP:%.*]] = icmp samesign ult i32 0, 0
+; CHECK-NEXT:    [[ICMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64
 ; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]]
@@ -16,8 +16,6 @@ define void @test() {
 ; CHECK-NEXT:    [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer)
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, i32 [[CALL]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP4]], <4 x i1> [[TMP1]], i64 4)
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
index 70c67ff251d6d5..9fc2b7d6e78656 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long-full-reg-stores.ll
@@ -9,10 +9,10 @@ define void @test(ptr noalias %0, ptr noalias %1) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[TMP9]], align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP10]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP14]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <6 x i32> <i32 2, i32 4, i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP7]], <6 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 5>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11>
 ; CHECK-NEXT:    store <6 x double> [[TMP13]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr [[TMP21]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
index ea497c95d41141..cfbfd0ebc37bca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll
@@ -10,24 +10,22 @@ define i32 @bar() local_unnamed_addr {
 ; CHECK-NEXT:    [[SUB102_1:%.*]] = sub nsw i32 undef, undef
 ; CHECK-NEXT:    [[ADD78_2:%.*]] = add nsw i32 undef, undef
 ; CHECK-NEXT:    [[SUB102_3:%.*]] = sub nsw i32 undef, undef
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[ADD94_1]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef>, i32 [[SUB86_1]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[ADD78_1]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[ADD78_2]], i32 5
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[SUB102_3]], i32 6
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 5, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 14>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> <i32 undef, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 poison>, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <8 x i32> [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP18]], <8 x i32> [[TMP10]], i64 8)
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 [[SUB102_1]], i32 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[ADD94_1]], i32 5
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 undef, i32 undef, i32 undef, i32 poison, i32 undef, i32 undef, i32 poison>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 7, i32 6, i32 5, i32 4, i32 24, i32 25, i32 26, i32 27, i32 poison, i32 29, i32 30, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[SUB102_3]], i32 12
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12>
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP13:%.*]] = and <16 x i32> [[TMP12]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP10]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP20]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]])
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[TMP17]], 16

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
index e9a65bf6d6f0d4..2f49a2e6a212e6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
@@ -6,11 +6,11 @@ define i1 @foo() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TOBOOL_NOT_NOT509_I_2329_I_I:%.*]] = icmp ne i32 0, 0
 ; CHECK-NEXT:    [[STOREMERGE_2333_I_I:%.*]] = select i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[STOREMERGE_2333_I_I]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> <i1 false, i1 false, i1 undef, i1 undef>, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef>, <4 x i1> [[TMP6]], i64 4)
+; CHECK-NEXT:    [[TOBOOL_NOT_NOT509_I_1_2_I_I:%.*]] = icmp ne i32 [[STOREMERGE_2333_I_I]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[TOBOOL_NOT_NOT509_I_1_2_I_I]], i32 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i1> [[TMP0]], i1 [[TOBOOL_NOT_NOT509_I_2329_I_I]], i32 5
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP1]], <4 x i1> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v2i1(<8 x i1> [[TMP2]], <2 x i1> zeroinitializer, i64 6)
 ; CHECK-NEXT:    [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 false, i1 [[TMP5]], i1 false

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
index 1294a87ff69679..c01c44ff03c153 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll
@@ -7,14 +7,20 @@ define void @test(i1 %c, ptr %arg) {
 ; CHECK:       if:
 ; CHECK-NEXT:    [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2)
 ; CHECK-NEXT:    br label [[JOIN:%.*]]
 ; CHECK:       else:
 ; CHECK-NEXT:    [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT:    [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2)
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 8ce3e62519b6dc..33fa00c1881da3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,19 +6,23 @@ define i32 @a() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i8> [[TMP13]], ptr null, align 4
+; CHECK-NEXT:    store <8 x i8> [[TMP23]], ptr null, align 4
+; CHECK-NEXT:    [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1]]
 ;
   br label %1

diff  --git a/llvm/test/Transforms/SLPVectorizer/addsub.ll b/llvm/test/Transforms/SLPVectorizer/addsub.ll
index 6814bc0f566f60..3961250d564518 100644
--- a/llvm/test/Transforms/SLPVectorizer/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/addsub.ll
@@ -387,10 +387,14 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
 
 define void @vec_shuff_reorder() #0 {
 ; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @fb, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @fa, align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2)
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>