[llvm] 9dc4ced - [SLP] Try partial store vectorization if supported by target.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Mon May 9 09:49:29 PDT 2022


Author: Alexey Bataev
Date: 2022-05-09T09:48:15-07:00
New Revision: 9dc4ced204d1e918d4c8d3279e52197e9a5abc94

URL: https://github.com/llvm/llvm-project/commit/9dc4ced204d1e918d4c8d3279e52197e9a5abc94
DIFF: https://github.com/llvm/llvm-project/commit/9dc4ced204d1e918d4c8d3279e52197e9a5abc94.diff

LOG: [SLP] Try partial store vectorization if supported by target.

We can try to vectorize a number of stores smaller than MinVecRegSize
/ scalar_value_size if the target allows it. This gives an extra
opportunity for vectorization.

Fixes PR54985.

Differential Revision: https://reviews.llvm.org/D124284
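
For intuition, here is a standalone sketch (not LLVM code; the constants are
assumptions, with MinVecRegSize taken as the common 128-bit default) of the
profitability gate this patch relaxes: a chain of eight i8 stores used to fail
the VF >= MinVF check, and passes once the target can report a smaller store
minimum VF.

#include <cstdio>

int main() {
  const unsigned MinVecRegSize = 128; // bits; assumed default
  const unsigned ScalarBits = 8;      // a chain of i8 stores
  const unsigned ChainLen = 8;        // e.g. the @add8 tests below

  unsigned OldMinVF = MinVecRegSize / ScalarBits; // 16
  unsigned NewMinVF = 2; // illustrative value the new hook may return

  printf("old MinVF=%u -> vectorize: %s\n", OldMinVF,
         ChainLen >= OldMinVF ? "yes" : "no"); // no
  printf("new MinVF=%u -> vectorize: %s\n", NewMinVF,
         ChainLen >= NewMinVF ? "yes" : "no"); // yes
  return 0;
}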

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-and-const-load.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
    llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
    llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
    llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll
    llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
    llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
    llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
    llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
    llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 19a6b14c5487c..376e1d13cdc10 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -950,6 +950,17 @@ class TargetTransformInfo {
   /// Currently only used by the SLP vectorizer.
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
 
+  /// \return The minimum vectorization factor for the store instruction. Given
+  /// an initial estimate of the minimum vector factor and the store value
+  /// type, it tries to find the lowest possible VF that might still be
+  /// profitable for vectorization.
+  /// \param VF Initial estimation of the minimum vector factor.
+  /// \param ScalarMemTy Scalar memory type of the store operation.
+  /// \param ScalarValTy Scalar type of the stored value.
+  /// Currently only used by the SLP vectorizer.
+  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                             Type *ScalarValTy) const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -1633,6 +1644,8 @@ class TargetTransformInfo::Concept {
   virtual ElementCount getMinimumVF(unsigned ElemWidth,
                                     bool IsScalable) const = 0;
   virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
+  virtual unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                                     Type *ScalarValTy) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() const = 0;
@@ -2136,6 +2149,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override {
     return Impl.getMaximumVF(ElemWidth, Opcode);
   }
+  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                             Type *ScalarValTy) const override {
+    return Impl.getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index df0a2491b6ff5..fe937adde411b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -424,6 +424,7 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { return 0; }
+  unsigned getStoreMinimumVF(unsigned VF, Type *, Type *) const { return VF; }
 
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 94791e7680512..a2b9860c46e6c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -312,6 +312,26 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
   }
 
+  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                             Type *ScalarValTy) const {
+    auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
+      auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
+      EVT VT = getTLI()->getValueType(DL, SrcTy);
+      if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
+          getTLI()->isOperationCustom(ISD::STORE, VT))
+        return true;
+
+      EVT ValVT =
+          getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
+      EVT LegalizedVT =
+          getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
+      return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
+    };
+    while (VF > 2 && IsSupportedByTarget(VF))
+      VF /= 2;
+    return VF;
+  }
+
   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
     EVT VT = getTLI()->getValueType(DL, Ty);

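The search above halves VF while the half-width store is still acceptable,
either as a Legal/Custom vector store or via a legal truncating store of the
wider value type. Below is a standalone model of that search, with a mock
legality predicate standing in for the real TargetLowering queries (an
assumption for illustration, not any target's actual tables):

#include <cstdio>

struct MockTLI {
  // Assume plain vector stores are handled down to 4 elements...
  bool storeLegalOrCustom(unsigned NumElts) const { return NumElts >= 4; }
  // ...and a truncating store covers the 2-element case.
  bool truncStoreLegal(unsigned NumElts) const { return NumElts == 2; }
};

static unsigned getStoreMinimumVF(unsigned VF, const MockTLI &TLI) {
  // Same shape as BasicTTIImplBase::getStoreMinimumVF: test VF/2, then halve.
  auto IsSupported = [&](unsigned CurVF) {
    unsigned Half = CurVF / 2;
    return TLI.storeLegalOrCustom(Half) || TLI.truncStoreLegal(Half);
  };
  while (VF > 2 && IsSupported(VF))
    VF /= 2;
  return VF;
}

int main() {
  MockTLI TLI;
  printf("MinVF starting from 16: %u\n", getStoreMinimumVF(16, TLI)); // 2
  return 0;
}
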
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 1792b241bd92a..b41f3efc5b55b 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -133,7 +133,7 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
   bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned Idx);
+                           unsigned Idx, unsigned MinVF);
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
 

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index df3d33229909b..384d95a5b70e5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -640,6 +640,11 @@ unsigned TargetTransformInfo::getMaximumVF(unsigned ElemWidth,
   return TTIImpl->getMaximumVF(ElemWidth, Opcode);
 }
 
+unsigned TargetTransformInfo::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
+                                                Type *ScalarValTy) const {
+  return TTIImpl->getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ce58d839fd64b..4f3db8c83394c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8737,11 +8737,8 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
   // If V is a store, just return the width of the stored value (or value
   // truncated just before storing) without traversing the expression tree.
   // This is the common case.
-  if (auto *Store = dyn_cast<StoreInst>(V)) {
-    if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
-      return DL->getTypeSizeInBits(Trunc->getSrcTy());
+  if (auto *Store = dyn_cast<StoreInst>(V))
     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
-  }
 
   if (auto *IEI = dyn_cast<InsertElementInst>(V))
     return getVectorElementSize(IEI->getOperand(1));
@@ -9175,11 +9172,10 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
 }
 
 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned Idx) {
+                                            unsigned Idx, unsigned MinVF) {
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                     << "\n");
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
-  const unsigned MinVF = R.getMinVecRegSize() / Sz;
   unsigned VF = Chain.size();
 
   if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
@@ -9318,9 +9314,15 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
     unsigned EltSize = R.getVectorElementSize(Operands[0]);
     unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
 
-    unsigned MinVF = R.getMinVF(EltSize);
     unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
                               MaxElts);
+    auto *Store = cast<StoreInst>(Operands[0]);
+    Type *StoreTy = Store->getValueOperand()->getType();
+    Type *ValueTy = StoreTy;
+    if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+      ValueTy = Trunc->getSrcTy();
+    unsigned MinVF = TTI->getStoreMinimumVF(
+        R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);
 
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
@@ -9330,7 +9332,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
         if (!VectorizedStores.count(Slice.front()) &&
             !VectorizedStores.count(Slice.back()) &&
-            vectorizeStoreChain(Slice, R, Cnt)) {
+            vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
           // Mark the vectorized stores so that we don't vectorize them again.
           VectorizedStores.insert(Slice.begin(), Slice.end());
           Changed = true;

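Note how vectorizeStores now distinguishes the in-memory store type from the
stored value type: the trunc peeking removed from getVectorElementSize moved
here, so the trunc-store legality path sees the pre-truncation width. A tiny
illustrative model (the struct is a stand-in, not LLVM's classes):

#include <cstdio>

struct StoreDesc {
  unsigned MemBits; // width of the stored (possibly truncated) value
  unsigned SrcBits; // width before the trunc; same as MemBits if no trunc
  bool HasTrunc;
};

static void deriveTypes(const StoreDesc &S, unsigned &StoreBits,
                        unsigned &ValueBits) {
  StoreBits = S.MemBits;                          // Type *StoreTy = ...
  ValueBits = S.HasTrunc ? S.SrcBits : S.MemBits; // look through the trunc
}

int main() {
  StoreDesc Plain{8, 8, false};   // store i8 %v
  StoreDesc Trunced{8, 32, true}; // %t = trunc i32 %x to i8 ; store i8 %t

  unsigned SB, VB;
  deriveTypes(Plain, SB, VB);
  printf("plain:   StoreTy=i%u ValueTy=i%u\n", SB, VB); // i8 / i8
  deriveTypes(Trunced, SB, VB);
  printf("trunced: StoreTy=i%u ValueTy=i%u\n", SB, VB); // i8 / i32
  return 0;
}
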
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
index d76a4c5034f04..e62d6da8e34dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-load.ll
@@ -93,102 +93,18 @@ entry:
 define void @add8(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
 ; SSE-LABEL: @add8(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
-; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
-; SSE-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
-; SSE-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
-; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
-; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
-; SSE-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
-; SSE-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
-; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
-; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
-; SSE-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
-; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
-; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
-; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
-; SSE-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
-; SSE-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
-; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
-; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
-; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
-; SSE-NEXT:    [[ADD_4:%.*]] = add i8 [[TMP9]], [[TMP8]]
-; SSE-NEXT:    store i8 [[ADD_4]], ptr [[ARRAYIDX2_4]], align 1
-; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
-; SSE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
-; SSE-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[ADD_5:%.*]] = add i8 [[TMP11]], [[TMP10]]
-; SSE-NEXT:    store i8 [[ADD_5]], ptr [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
-; SSE-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
-; SSE-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[ADD_6:%.*]] = add i8 [[TMP13]], [[TMP12]]
-; SSE-NEXT:    store i8 [[ADD_6]], ptr [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
-; SSE-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
-; SSE-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
-; SSE-NEXT:    [[ADD_7:%.*]] = add i8 [[TMP15]], [[TMP14]]
-; SSE-NEXT:    store i8 [[ADD_7]], ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = add <8 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <8 x i8> [[TMP2]], ptr [[R]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @add8(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
-; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
-; AVX-NEXT:    [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]]
-; AVX-NEXT:    store i8 [[ADD]], ptr [[R]], align 1
-; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
-; AVX-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP3]], [[TMP2]]
-; AVX-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX2_1]], align 1
-; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
-; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
-; AVX-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP5]], [[TMP4]]
-; AVX-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX2_2]], align 1
-; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
-; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
-; AVX-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP7]], [[TMP6]]
-; AVX-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX2_3]], align 1
-; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
-; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
-; AVX-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
-; AVX-NEXT:    [[ADD_4:%.*]] = add i8 [[TMP9]], [[TMP8]]
-; AVX-NEXT:    store i8 [[ADD_4]], ptr [[ARRAYIDX2_4]], align 1
-; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
-; AVX-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
-; AVX-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
-; AVX-NEXT:    [[ADD_5:%.*]] = add i8 [[TMP11]], [[TMP10]]
-; AVX-NEXT:    store i8 [[ADD_5]], ptr [[ARRAYIDX2_5]], align 1
-; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
-; AVX-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
-; AVX-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
-; AVX-NEXT:    [[ADD_6:%.*]] = add i8 [[TMP13]], [[TMP12]]
-; AVX-NEXT:    store i8 [[ADD_6]], ptr [[ARRAYIDX2_6]], align 1
-; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
-; AVX-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
-; AVX-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
-; AVX-NEXT:    [[ADD_7:%.*]] = add i8 [[TMP15]], [[TMP14]]
-; AVX-NEXT:    store i8 [[ADD_7]], ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = add <8 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <8 x i8> [[TMP2]], ptr [[R]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-and-const-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-and-const-load.ll
index a75f94578cdba..73fde350248bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-and-const-load.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-and-const-load.ll
@@ -81,86 +81,16 @@ entry:
 define void @and8(ptr noalias nocapture noundef writeonly %dst, ptr noalias nocapture noundef readonly %src) {
 ; SSE-LABEL: @and8(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
-; SSE-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], -64
-; SSE-NEXT:    store i8 [[TMP1]], ptr [[DST:%.*]], align 1
-; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
-; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; SSE-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], -64
-; SSE-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 1
-; SSE-NEXT:    store i8 [[TMP3]], ptr [[ARRAYIDX3_1]], align 1
-; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2
-; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; SSE-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], -64
-; SSE-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 2
-; SSE-NEXT:    store i8 [[TMP5]], ptr [[ARRAYIDX3_2]], align 1
-; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3
-; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; SSE-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], -64
-; SSE-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 3
-; SSE-NEXT:    store i8 [[TMP7]], ptr [[ARRAYIDX3_3]], align 1
-; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; SSE-NEXT:    [[TMP9:%.*]] = and i8 [[TMP8]], -64
-; SSE-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 4
-; SSE-NEXT:    store i8 [[TMP9]], ptr [[ARRAYIDX3_4]], align 1
-; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 5
-; SSE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; SSE-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], -64
-; SSE-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 5
-; SSE-NEXT:    store i8 [[TMP11]], ptr [[ARRAYIDX3_5]], align 1
-; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6
-; SSE-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; SSE-NEXT:    [[TMP13:%.*]] = and i8 [[TMP12]], -64
-; SSE-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 6
-; SSE-NEXT:    store i8 [[TMP13]], ptr [[ARRAYIDX3_6]], align 1
-; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 7
-; SSE-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; SSE-NEXT:    [[TMP15:%.*]] = and i8 [[TMP14]], -64
-; SSE-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 7
-; SSE-NEXT:    store i8 [[TMP15]], ptr [[ARRAYIDX3_7]], align 1
+; SSE-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = and <8 x i8> [[TMP0]], <i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64>
+; SSE-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST:%.*]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @and8(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
-; AVX-NEXT:    [[TMP1:%.*]] = and i8 [[TMP0]], -64
-; AVX-NEXT:    store i8 [[TMP1]], ptr [[DST:%.*]], align 1
-; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
-; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; AVX-NEXT:    [[TMP3:%.*]] = and i8 [[TMP2]], -64
-; AVX-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 1
-; AVX-NEXT:    store i8 [[TMP3]], ptr [[ARRAYIDX3_1]], align 1
-; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; AVX-NEXT:    [[TMP5:%.*]] = and i8 [[TMP4]], -64
-; AVX-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 2
-; AVX-NEXT:    store i8 [[TMP5]], ptr [[ARRAYIDX3_2]], align 1
-; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3
-; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; AVX-NEXT:    [[TMP7:%.*]] = and i8 [[TMP6]], -64
-; AVX-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 3
-; AVX-NEXT:    store i8 [[TMP7]], ptr [[ARRAYIDX3_3]], align 1
-; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
-; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; AVX-NEXT:    [[TMP9:%.*]] = and i8 [[TMP8]], -64
-; AVX-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 4
-; AVX-NEXT:    store i8 [[TMP9]], ptr [[ARRAYIDX3_4]], align 1
-; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 5
-; AVX-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; AVX-NEXT:    [[TMP11:%.*]] = and i8 [[TMP10]], -64
-; AVX-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 5
-; AVX-NEXT:    store i8 [[TMP11]], ptr [[ARRAYIDX3_5]], align 1
-; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6
-; AVX-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; AVX-NEXT:    [[TMP13:%.*]] = and i8 [[TMP12]], -64
-; AVX-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 6
-; AVX-NEXT:    store i8 [[TMP13]], ptr [[ARRAYIDX3_6]], align 1
-; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 7
-; AVX-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; AVX-NEXT:    [[TMP15:%.*]] = and i8 [[TMP14]], -64
-; AVX-NEXT:    [[ARRAYIDX3_7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 7
-; AVX-NEXT:    store i8 [[TMP15]], ptr [[ARRAYIDX3_7]], align 1
+; AVX-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[SRC:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = and <8 x i8> [[TMP0]], <i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64>
+; AVX-NEXT:    store <8 x i8> [[TMP1]], ptr [[DST:%.*]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
index 42463832c8831..46dc722660c03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-load.ll
@@ -93,102 +93,18 @@ entry:
 define void @add8(ptr noalias nocapture noundef %r, ptr noalias nocapture noundef readonly %a) {
 ; SSE-LABEL: @add8(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
-; SSE-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
-; SSE-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
-; SSE-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
-; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; SSE-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
-; SSE-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
-; SSE-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
-; SSE-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
-; SSE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; SSE-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
-; SSE-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
-; SSE-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
-; SSE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
-; SSE-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
-; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
-; SSE-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
-; SSE-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
-; SSE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
-; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
-; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
-; SSE-NEXT:    [[MUL_4:%.*]] = mul i8 [[TMP9]], [[TMP8]]
-; SSE-NEXT:    store i8 [[MUL_4]], ptr [[ARRAYIDX2_4]], align 1
-; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
-; SSE-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
-; SSE-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[MUL_5:%.*]] = mul i8 [[TMP11]], [[TMP10]]
-; SSE-NEXT:    store i8 [[MUL_5]], ptr [[ARRAYIDX2_5]], align 1
-; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
-; SSE-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
-; SSE-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[MUL_6:%.*]] = mul i8 [[TMP13]], [[TMP12]]
-; SSE-NEXT:    store i8 [[MUL_6]], ptr [[ARRAYIDX2_6]], align 1
-; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
-; SSE-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; SSE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
-; SSE-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
-; SSE-NEXT:    [[MUL_7:%.*]] = mul i8 [[TMP15]], [[TMP14]]
-; SSE-NEXT:    store i8 [[MUL_7]], ptr [[ARRAYIDX2_7]], align 1
+; SSE-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A:%.*]], align 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[R:%.*]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = mul <8 x i8> [[TMP1]], [[TMP0]]
+; SSE-NEXT:    store <8 x i8> [[TMP2]], ptr [[R]], align 1
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @add8(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1
-; AVX-NEXT:    [[TMP1:%.*]] = load i8, ptr [[R:%.*]], align 1
-; AVX-NEXT:    [[MUL:%.*]] = mul i8 [[TMP1]], [[TMP0]]
-; AVX-NEXT:    store i8 [[MUL]], ptr [[R]], align 1
-; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1
-; AVX-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX2_1]], align 1
-; AVX-NEXT:    [[MUL_1:%.*]] = mul i8 [[TMP3]], [[TMP2]]
-; AVX-NEXT:    store i8 [[MUL_1]], ptr [[ARRAYIDX2_1]], align 1
-; AVX-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 2
-; AVX-NEXT:    [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX2_2]], align 1
-; AVX-NEXT:    [[MUL_2:%.*]] = mul i8 [[TMP5]], [[TMP4]]
-; AVX-NEXT:    store i8 [[MUL_2]], ptr [[ARRAYIDX2_2]], align 1
-; AVX-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3
-; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 3
-; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX2_3]], align 1
-; AVX-NEXT:    [[MUL_3:%.*]] = mul i8 [[TMP7]], [[TMP6]]
-; AVX-NEXT:    store i8 [[MUL_3]], ptr [[ARRAYIDX2_3]], align 1
-; AVX-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
-; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 4
-; AVX-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX2_4]], align 1
-; AVX-NEXT:    [[MUL_4:%.*]] = mul i8 [[TMP9]], [[TMP8]]
-; AVX-NEXT:    store i8 [[MUL_4]], ptr [[ARRAYIDX2_4]], align 1
-; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5
-; AVX-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 5
-; AVX-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX2_5]], align 1
-; AVX-NEXT:    [[MUL_5:%.*]] = mul i8 [[TMP11]], [[TMP10]]
-; AVX-NEXT:    store i8 [[MUL_5]], ptr [[ARRAYIDX2_5]], align 1
-; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6
-; AVX-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 6
-; AVX-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX2_6]], align 1
-; AVX-NEXT:    [[MUL_6:%.*]] = mul i8 [[TMP13]], [[TMP12]]
-; AVX-NEXT:    store i8 [[MUL_6]], ptr [[ARRAYIDX2_6]], align 1
-; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7
-; AVX-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
-; AVX-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds i8, ptr [[R]], i64 7
-; AVX-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX2_7]], align 1
-; AVX-NEXT:    [[MUL_7:%.*]] = mul i8 [[TMP15]], [[TMP14]]
-; AVX-NEXT:    store i8 [[MUL_7]], ptr [[ARRAYIDX2_7]], align 1
+; AVX-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A:%.*]], align 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[R:%.*]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = mul <8 x i8> [[TMP1]], [[TMP0]]
+; AVX-NEXT:    store <8 x i8> [[TMP2]], ptr [[R]], align 1
 ; AVX-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
index 59bc984118fab..c9590ae76f405 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
@@ -11,27 +11,23 @@ define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.
 ; CHECK-LABEL: @LzmaDec_DecodeReal2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P:%.*]], i64 0, i32 4
-; CHECK-NEXT:    [[CODE21_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    br label [[DO_BODY66_I:%.*]]
 ; CHECK:       do.body66.i:
-; CHECK-NEXT:    [[RANGE_2_I:%.*]] = phi i32 [ [[RANGE_4_I:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[CODE_2_I:%.*]] = phi i32 [ [[CODE_4_I:%.*]], [[DO_COND_I]] ], [ undef, [[ENTRY]] ]
-; CHECK-NEXT:    [[DOTRANGE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_2_I]]
-; CHECK-NEXT:    [[DOTCODE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_2_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> <i32 undef, i32 poison>, i32 [[TMP2]], i32 1
 ; CHECK-NEXT:    br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]]
 ; CHECK:       if.else.i:
-; CHECK-NEXT:    [[SUB91_I:%.*]] = sub i32 [[DOTRANGE_2_I]], undef
-; CHECK-NEXT:    [[SUB92_I:%.*]] = sub i32 [[DOTCODE_2_I]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], undef
 ; CHECK-NEXT:    br label [[DO_COND_I]]
 ; CHECK:       do.cond.i:
-; CHECK-NEXT:    [[RANGE_4_I]] = phi i32 [ [[SUB91_I]], [[IF_ELSE_I]] ], [ undef, [[DO_BODY66_I]] ]
-; CHECK-NEXT:    [[CODE_4_I]] = phi i32 [ [[SUB92_I]], [[IF_ELSE_I]] ], [ [[DOTCODE_2_I]], [[DO_BODY66_I]] ]
+; CHECK-NEXT:    [[TMP5]] = phi <2 x i32> [ [[TMP4]], [[IF_ELSE_I]] ], [ [[TMP3]], [[DO_BODY66_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]]
 ; CHECK:       do.end1006.i:
-; CHECK-NEXT:    [[DOTRANGE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_4_I]]
-; CHECK-NEXT:    [[DOTCODE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_4_I]]
-; CHECK-NEXT:    store i32 [[DOTRANGE_4_I]], i32* [[RANGE20_I]], align 4
-; CHECK-NEXT:    store i32 [[DOTCODE_4_I]], i32* [[CODE21_I]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[RANGE20_I]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
index bbce681cdfb7f..c33928a604484 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
@@ -14,23 +14,18 @@ define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btCons
 ; CHECK-NEXT:    ret void
 ; CHECK:       if.else:
 ; CHECK-NEXT:    [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0
-; CHECK-NEXT:    [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1
 ; CHECK-NEXT:    br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]]
 ; CHECK:       land.lhs.true.i.1:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]]
 ; CHECK:       if.then7.1:
-; CHECK-NEXT:    [[INC_1:%.*]] = add nsw i32 0, 1
-; CHECK-NEXT:    store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4
-; CHECK-NEXT:    [[DEC_1:%.*]] = add nsw i32 6, -1
-; CHECK-NEXT:    store i32 [[DEC_1]], i32* [[NUB5]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> <i32 1, i32 5>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC_1]]
 ; CHECK:       for.inc.1:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ]
-; CHECK-NEXT:    [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1
-; CHECK-NEXT:    store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4
-; CHECK-NEXT:    [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    store i32 [[DEC_2]], i32* [[NUB5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ <i32 1, i32 5>, [[IF_THEN7_1]] ], [ <i32 0, i32 6>, [[LAND_LHS_TRUE_I_1]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    unreachable
 ;
 entry:
@@ -74,15 +69,14 @@ define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector
 ; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 1
 ; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS]], i64 0, i32 2, i64 0, i32 0, i64 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX36]], align 4
-; CHECK-NEXT:    [[ADD587:%.*]] = fadd float undef, undef
-; CHECK-NEXT:    [[SUB600:%.*]] = fsub float [[ADD587]], undef
-; CHECK-NEXT:    store float [[SUB600]], float* undef, align 4
-; CHECK-NEXT:    [[SUB613:%.*]] = fsub float [[ADD587]], [[SUB600]]
-; CHECK-NEXT:    store float [[SUB613]], float* [[ARRAYIDX26]], align 4
-; CHECK-NEXT:    [[ADD626:%.*]] = fadd float [[TMP0]], undef
-; CHECK-NEXT:    [[SUB639:%.*]] = fsub float [[ADD626]], undef
-; CHECK-NEXT:    [[SUB652:%.*]] = fsub float [[ADD626]], [[SUB639]]
-; CHECK-NEXT:    store float [[SUB652]], float* [[ARRAYIDX36]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float undef, float poison>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], undef
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    store float [[TMP4]], float* undef, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX26]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP5]], <2 x float>* [[TMP6]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]]
 ; CHECK:       if.then1595:
 ; CHECK-NEXT:    br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
index abf241a810541..30abc6f9fbe87 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -24,34 +24,29 @@ define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(%class.
 ; CHECK:       for.body233:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]]
 ; CHECK:       for.end271:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
-; CHECK-NEXT:    [[SUB275:%.*]] = fsub float undef, [[TMP1]]
-; CHECK-NEXT:    [[SUB279:%.*]] = fsub float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ <float 0x47EFFFFFE0000000, float 0x47EFFFFFE0000000>, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x float> undef, [[TMP0]]
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]]
 ; CHECK:       if.then291:
-; CHECK-NEXT:    [[MUL292:%.*]] = fmul float [[SUB275]], 5.000000e-01
-; CHECK-NEXT:    [[ADD294:%.*]] = fadd float [[TMP1]], [[MUL292]]
-; CHECK-NEXT:    [[MUL295:%.*]] = fmul float [[SUB279]], 5.000000e-01
-; CHECK-NEXT:    [[ADD297:%.*]] = fadd float [[TMP0]], [[MUL295]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 5.000000e-01, float 5.000000e-01>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]]
 ; CHECK:       if.else319:
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]]
 ; CHECK:       if.then325:
 ; CHECK-NEXT:    br label [[IF_END327]]
 ; CHECK:       if.end327:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> <float poison, float undef>, float [[TMP4]], i32 0
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]]
 ; CHECK:       if.then329:
 ; CHECK-NEXT:    br label [[IF_END332]]
 ; CHECK:       if.end332:
-; CHECK-NEXT:    [[DX272_1:%.*]] = phi float [ [[SUB275]], [[IF_THEN329]] ], [ [[SUB275]], [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ]
-; CHECK-NEXT:    [[DY276_1:%.*]] = phi float [ undef, [[IF_THEN329]] ], [ undef, [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ]
-; CHECK-NEXT:    [[SUB334:%.*]] = fsub float [[ADD294]], [[DX272_1]]
-; CHECK-NEXT:    [[SUB338:%.*]] = fsub float [[ADD297]], [[DY276_1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[IF_THEN329]] ], [ [[TMP5]], [[IF_END327]] ], [ <float 0x3F847AE140000000, float 0x3F847AE140000000>, [[IF_THEN291]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[ARRAYIDX_I_I606:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113:%.*]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES:%.*]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    store float [[SUB334]], float* [[ARRAYIDX_I_I606]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_I607:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    store float [[SUB338]], float* [[ARRAYIDX3_I607]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_I_I606]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP7]], <2 x float>* [[TMP8]], align 4
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
index b4bc79a5247d2..38ccecd0e0791 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
@@ -27,25 +27,24 @@ define void @SIM4() {
 ; CHECK:       land.rhs.lr.ph:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end98:
-; CHECK-NEXT:    [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1
 ; CHECK-NEXT:    br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]]
 ; CHECK:       if.then103:
+; CHECK-NEXT:    [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0
 ; CHECK-NEXT:    [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef
 ; CHECK-NEXT:    [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2
-; CHECK-NEXT:    [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0
 ; CHECK-NEXT:    [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[COND125]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1
 ; CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 ; CHECK:       for.cond.i:
-; CHECK-NEXT:    [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ]
-; CHECK-NEXT:    [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ]
 ; CHECK-NEXT:    br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]]
 ; CHECK:       land.rhs.i874:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]]
 ; CHECK:       for.end.i:
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]]
 ; CHECK:       if.then.i:
-; CHECK-NEXT:    [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef
-; CHECK-NEXT:    [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef
 ; CHECK-NEXT:    br label [[EXTEND_BW_EXIT:%.*]]
 ; CHECK:       if.end.i:
 ; CHECK-NEXT:    [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]]
@@ -66,14 +65,12 @@ define void @SIM4() {
 ; CHECK:       while.end275.i:
 ; CHECK-NEXT:    br label [[EXTEND_BW_EXIT]]
 ; CHECK:       extend_bw.exit:
-; CHECK-NEXT:    [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ]
-; CHECK-NEXT:    [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ]
 ; CHECK-NEXT:    br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]]
 ; CHECK:       if.then157:
-; CHECK-NEXT:    [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1
-; CHECK-NEXT:    store i32 [[ADD158]], i32* [[FROM299]], align 4
-; CHECK-NEXT:    [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1
-; CHECK-NEXT:    store i32 [[ADD160]], i32* [[FROM1115]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    br label [[LAND_LHS_TRUE167]]
 ; CHECK:       land.lhs.true167:
 ; CHECK-NEXT:    unreachable

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
index aa4cd49e4649e..13ad3612ed90c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@@ -195,30 +195,9 @@ define void @fptosi_8f64_8i16() #0 {
 
 define void @fptosi_8f64_8i8() #0 {
 ; CHECK-LABEL: @fptosi_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
@@ -428,30 +407,9 @@ define void @fptosi_8f32_8i16() #0 {
 
 define void @fptosi_8f32_8i8() #0 {
 ; CHECK-LABEL: @fptosi_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
index 3490fa909486e..27878a0b0438c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -195,30 +195,9 @@ define void @fptosi_8f64_8i16() #0 {
 
 define void @fptosi_8f64_8i8() #0 {
 ; CHECK-LABEL: @fptosi_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
@@ -428,30 +407,9 @@ define void @fptosi_8f32_8i16() #0 {
 
 define void @fptosi_8f32_8i8() #0 {
 ; CHECK-LABEL: @fptosi_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptosi float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptosi float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptosi float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptosi float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
index b3698f260f360..1225024dc16cf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -195,30 +195,9 @@ define void @fptoui_8f64_8i16() #0 {
 
 define void @fptoui_8f64_8i8() #0 {
 ; CHECK-LABEL: @fptoui_8f64_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
-; CHECK-NEXT:    [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
-; CHECK-NEXT:    [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
-; CHECK-NEXT:    [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
-; CHECK-NEXT:    [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = fptoui double [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptoui double [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptoui double [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptoui double [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptoui double [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptoui double [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptoui double [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptoui double [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
@@ -428,30 +407,9 @@ define void @fptoui_8f32_8i16() #0 {
 
 define void @fptoui_8f32_8i8() #0 {
 ; CHECK-LABEL: @fptoui_8f32_8i8(
-; CHECK-NEXT:    [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
-; CHECK-NEXT:    [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
-; CHECK-NEXT:    [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
-; CHECK-NEXT:    [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
-; CHECK-NEXT:    [[CVT0:%.*]] = fptoui float [[A0]] to i8
-; CHECK-NEXT:    [[CVT1:%.*]] = fptoui float [[A1]] to i8
-; CHECK-NEXT:    [[CVT2:%.*]] = fptoui float [[A2]] to i8
-; CHECK-NEXT:    [[CVT3:%.*]] = fptoui float [[A3]] to i8
-; CHECK-NEXT:    [[CVT4:%.*]] = fptoui float [[A4]] to i8
-; CHECK-NEXT:    [[CVT5:%.*]] = fptoui float [[A5]] to i8
-; CHECK-NEXT:    [[CVT6:%.*]] = fptoui float [[A6]] to i8
-; CHECK-NEXT:    [[CVT7:%.*]] = fptoui float [[A7]] to i8
-; CHECK-NEXT:    store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
-; CHECK-NEXT:    store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
-; CHECK-NEXT:    store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
-; CHECK-NEXT:    store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
-; CHECK-NEXT:    store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
-; CHECK-NEXT:    store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
-; CHECK-NEXT:    store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
-; CHECK-NEXT:    store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i8>
+; CHECK-NEXT:    store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1
 ; CHECK-NEXT:    ret void
 ;
   %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4

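The four fptosi/fptoui tests above (8 x f64 -> i8 and 8 x f32 -> i8) all reduce to the same situation: eight i8 stores, which together cover only 64 bits and so fall below the MinVecRegSize / scalar-size cut-off (128 / 8 = 16 lanes) that the vectorizer used to insist on. A minimal hand-written sketch of the shape now emitted, with illustrative names that are not taken from the patch:

  define void @cvt_sketch(<8 x float> %v, i8* %dst) {
    ; <8 x i8> is a 64-bit value, narrower than the 128-bit minimum
    ; vector register; the new store-minimum-VF hook lets the target
    ; accept the smaller factor.
    %cvt = fptosi <8 x float> %v to <8 x i8>
    %vp  = bitcast i8* %dst to <8 x i8>*
    store <8 x i8> %cvt, <8 x i8>* %vp, align 1
    ret void
  }
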
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 66b7d972fb92f..e0036d0568d67 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -143,15 +143,11 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; PR41892
 define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){
 ; CHECK-LABEL: @test_v4f32_v2f32_store(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1
-; CHECK-NEXT:    [[ADD01:%.*]] = fadd float [[X0]], [[X1]]
-; CHECK-NEXT:    store float [[ADD01]], float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x float> [[F]], i64 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x float> [[F]], i64 3
-; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[X2]], [[X3]]
-; CHECK-NEXT:    [[P23:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    store float [[ADD23]], float* [[P23]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[P:%.*]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %x0 = extractelement <4 x float> %f, i64 0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index 6eb51c56c070a..7f7153d30a036 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -143,15 +143,11 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; PR41892
 define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){
 ; CHECK-LABEL: @test_v4f32_v2f32_store(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1
-; CHECK-NEXT:    [[ADD01:%.*]] = fadd float [[X0]], [[X1]]
-; CHECK-NEXT:    store float [[ADD01]], float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x float> [[F]], i64 2
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x float> [[F]], i64 3
-; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[X2]], [[X3]]
-; CHECK-NEXT:    [[P23:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    store float [[ADD23]], float* [[P23]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[F:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[F]], <4 x float> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[P:%.*]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %x0 = extractelement <4 x float> %f, i64 0

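In both hadd files the PR41892 function collapses to a single 64-bit store. The two shuffle masks pair the lanes so that one vector fadd computes both horizontal sums; written out (a restatement of the CHECK lines above, not new output):

  ; [[TMP1]] = <f[1], f[2]>    [[TMP2]] = <f[0], f[3]>
  ; [[TMP3]] = <f[1]+f[0], f[2]+f[3]> = <add01, add23>
  ; one <2 x float> store then replaces the two scalar stores

A <2 x float> store is again narrower than MinVecRegSize (2 lanes versus the old 128 / 32 = 4 lane minimum), which is exactly the case this patch enables.
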
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index f00ecd3241850..63c5b19a1ff40 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -22,147 +22,43 @@ entry:
 }
 
 define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 {
-; SSE-LABEL: @bar(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[W:%.*]], i32 0
-; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[W]], i32 0
-; SSE-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[W]], i32 0
-; SSE-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[W]], i32 0
-; SSE-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    br label [[FOR_BODY:%.*]]
-; SSE:       for.body:
-; SSE-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
-; SSE-NEXT:    [[TMP4:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>*
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; SSE-NEXT:    [[TMP6:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>*
-; SSE-NEXT:    [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; SSE-NEXT:    [[TMP8:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>*
-; SSE-NEXT:    [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
-; SSE-NEXT:    [[TMP10:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>*
-; SSE-NEXT:    [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1
-; SSE-NEXT:    [[TMP12:%.*]] = icmp ult <4 x i8> [[TMP5]], [[TMP7]]
-; SSE-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i8> [[TMP11]], <4 x i8> [[TMP9]]
-; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32>
-; SSE-NEXT:    [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[SHUFFLE]]
-; SSE-NEXT:    [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i8>
-; SSE-NEXT:    [[TMP17:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>*
-; SSE-NEXT:    store <4 x i8> [[TMP16]], <4 x i8>* [[TMP17]], align 1
-; SSE-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
-; SSE-NEXT:    [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
-; SSE-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
-; SSE-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
-; SSE-NEXT:    [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
-; SSE-NEXT:    [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>*
-; SSE-NEXT:    [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1
-; SSE-NEXT:    [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>*
-; SSE-NEXT:    [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1
-; SSE-NEXT:    [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>*
-; SSE-NEXT:    [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1
-; SSE-NEXT:    [[TMP24:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>*
-; SSE-NEXT:    [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1
-; SSE-NEXT:    [[TMP26:%.*]] = icmp ult <4 x i8> [[TMP19]], [[TMP21]]
-; SSE-NEXT:    [[TMP27:%.*]] = select <4 x i1> [[TMP26]], <4 x i8> [[TMP25]], <4 x i8> [[TMP23]]
-; SSE-NEXT:    [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32>
-; SSE-NEXT:    [[TMP29:%.*]] = mul <4 x i32> [[TMP28]], [[SHUFFLE1]]
-; SSE-NEXT:    [[TMP30:%.*]] = trunc <4 x i32> [[TMP29]] to <4 x i8>
-; SSE-NEXT:    [[TMP31:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>*
-; SSE-NEXT:    store <4 x i8> [[TMP30]], <4 x i8>* [[TMP31]], align 1
-; SSE-NEXT:    [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
-; SSE-NEXT:    [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
-; SSE-NEXT:    [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
-; SSE-NEXT:    [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
-; SSE-NEXT:    [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
-; SSE-NEXT:    [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>*
-; SSE-NEXT:    [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1
-; SSE-NEXT:    [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>*
-; SSE-NEXT:    [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1
-; SSE-NEXT:    [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>*
-; SSE-NEXT:    [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1
-; SSE-NEXT:    [[TMP38:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>*
-; SSE-NEXT:    [[TMP39:%.*]] = load <4 x i8>, <4 x i8>* [[TMP38]], align 1
-; SSE-NEXT:    [[TMP40:%.*]] = icmp ult <4 x i8> [[TMP33]], [[TMP35]]
-; SSE-NEXT:    [[TMP41:%.*]] = select <4 x i1> [[TMP40]], <4 x i8> [[TMP39]], <4 x i8> [[TMP37]]
-; SSE-NEXT:    [[TMP42:%.*]] = zext <4 x i8> [[TMP41]] to <4 x i32>
-; SSE-NEXT:    [[TMP43:%.*]] = mul <4 x i32> [[TMP42]], [[SHUFFLE2]]
-; SSE-NEXT:    [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i8>
-; SSE-NEXT:    [[TMP45:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>*
-; SSE-NEXT:    store <4 x i8> [[TMP44]], <4 x i8>* [[TMP45]], align 1
-; SSE-NEXT:    [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
-; SSE-NEXT:    [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
-; SSE-NEXT:    [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
-; SSE-NEXT:    [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
-; SSE-NEXT:    [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
-; SSE-NEXT:    [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>*
-; SSE-NEXT:    [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1
-; SSE-NEXT:    [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>*
-; SSE-NEXT:    [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1
-; SSE-NEXT:    [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>*
-; SSE-NEXT:    [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1
-; SSE-NEXT:    [[TMP52:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>*
-; SSE-NEXT:    [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1
-; SSE-NEXT:    [[TMP54:%.*]] = icmp ult <4 x i8> [[TMP47]], [[TMP49]]
-; SSE-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP54]], <4 x i8> [[TMP53]], <4 x i8> [[TMP51]]
-; SSE-NEXT:    [[TMP56:%.*]] = zext <4 x i8> [[TMP55]] to <4 x i32>
-; SSE-NEXT:    [[TMP57:%.*]] = mul <4 x i32> [[TMP56]], [[SHUFFLE3]]
-; SSE-NEXT:    [[TMP58:%.*]] = trunc <4 x i32> [[TMP57]] to <4 x i8>
-; SSE-NEXT:    [[TMP59:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>*
-; SSE-NEXT:    store <4 x i8> [[TMP58]], <4 x i8>* [[TMP59]], align 1
-; SSE-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
-; SSE-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
-; SSE-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
-; SSE-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
-; SSE-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
-; SSE-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
-; SSE-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
-; SSE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; SSE:       for.end:
-; SSE-NEXT:    ret void
-;
-; AVX512-LABEL: @bar(
-; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[W:%.*]], i32 0
-; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> zeroinitializer
-; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX512:       for.body:
-; AVX512-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
-; AVX512-NEXT:    [[TMP1:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
-; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
-; AVX512-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; AVX512-NEXT:    [[TMP5:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
-; AVX512-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1
-; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
-; AVX512-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
-; AVX512-NEXT:    [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP2]], [[TMP4]]
-; AVX512-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP6]]
-; AVX512-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
-; AVX512-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP11]], [[SHUFFLE]]
-; AVX512-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; AVX512-NEXT:    [[TMP14:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
-; AVX512-NEXT:    store <16 x i8> [[TMP13]], <16 x i8>* [[TMP14]], align 1
-; AVX512-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
-; AVX512-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
-; AVX512-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
-; AVX512-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
-; AVX512-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
-; AVX512-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
-; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
-; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; AVX512:       for.end:
-; AVX512-NEXT:    ret void
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[W:%.*]], i32 0
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <16 x i32> [[TMP11]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP13]], <16 x i8>* [[TMP14]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; CHECK-NEXT:    [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; CHECK-NEXT:    [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; CHECK-NEXT:    [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; CHECK-NEXT:    [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body

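Since the SSE output now matches the AVX512 output (both produce the single <16 x i8> loop body), the two prefix-specific blocks fold into one common CHECK block. That is the normal update_test_checks.py behaviour when the RUN lines share a common prefix; the file's RUN lines, not visible in this hunk, presumably have the form:

  ; RUN: opt ... | FileCheck %s --check-prefixes=CHECK,SSE
  ; RUN: opt ... | FileCheck %s --check-prefixes=CHECK,AVX512
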
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
index 8c834ef93d13b..820d8e0888ef1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -78,9 +78,8 @@ define void @delete_pointer_bound(float* %a, float* %b, i1 %c) #0 {
 ; CHECK-NEXT:    [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
 ; CHECK-NEXT:    store float [[L6]], float* [[A_5]], align 4
 ; CHECK-NEXT:    [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6
-; CHECK-NEXT:    store float 0.000000e+00, float* [[A_6]], align 4
-; CHECK-NEXT:    [[A_7:%.*]] = getelementptr inbounds float, float* [[A]], i64 7
-; CHECK-NEXT:    store float 0.000000e+00, float* [[A_7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A_6]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> zeroinitializer, <2 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -219,30 +218,27 @@ define void @gather_sequence_crash(<2 x float> %arg, float* %arg1, float %arg2,
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr float, float* [[TMP23]], i64 6
 ; CHECK-NEXT:    store float 0.000000e+00, float* [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = load float, float* [[ARG5:%.*]], align 4
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[TMP23]], i64 5
 ; CHECK-NEXT:    [[TMP29:%.*]] = fadd float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    store float 0.000000e+00, float* [[TMP26]], align 4
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[TMP23]], i64 4
-; CHECK-NEXT:    store float 0.000000e+00, float* [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP31:%.*]] = fadd float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    store float 0.000000e+00, float* [[TMP30]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP30]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> zeroinitializer, <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, float* [[ARG4]], i32 0
 ; CHECK-NEXT:    br label [[BB33:%.*]]
 ; CHECK:       bb33:
 ; CHECK-NEXT:    br label [[BB34:%.*]]
 ; CHECK:       bb34:
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr float, float* [[TMP32]], i64 3
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 2
 ; CHECK-NEXT:    [[TMP37:%.*]] = load float, float* [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP38:%.*]] = fadd float 0.000000e+00, [[TMP37]]
 ; CHECK-NEXT:    store float [[TMP38]], float* [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr float, float* [[TMP32]], i64 1
-; CHECK-NEXT:    [[TMP40:%.*]] = load float, float* [[TMP36]], align 4
-; CHECK-NEXT:    [[TMP41:%.*]] = fadd float 0.000000e+00, [[TMP40]]
-; CHECK-NEXT:    store float [[TMP41]], float* [[TMP36]], align 4
-; CHECK-NEXT:    [[TMP42:%.*]] = load float, float* [[TMP39]], align 4
-; CHECK-NEXT:    [[TMP43:%.*]] = fadd float 0.000000e+00, [[TMP42]]
-; CHECK-NEXT:    store float [[TMP43]], float* [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP39]] to <2 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP39]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP8]], <2 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[TMP44:%.*]] = load float, float* [[ARG3:%.*]], align 4
 ; CHECK-NEXT:    [[TMP45:%.*]] = load float, float* [[TMP32]], align 4
 ; CHECK-NEXT:    [[TMP46:%.*]] = fadd float 0.000000e+00, [[TMP45]]

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
index c0ed976fd702f..a53c89b407fdd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll
@@ -5,27 +5,25 @@ define void @test_add_sdiv(i32 *%arr1, i32 *%arr2, i32 %a0, i32 %a1, i32 %a2, i3
 ; CHECK-LABEL: @test_add_sdiv(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[GEP1_0:%.*]] = getelementptr i32, i32* [[ARR1:%.*]], i32 0
-; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr i32, i32* [[ARR1]], i32 1
 ; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr i32, i32* [[ARR1]], i32 2
 ; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr i32, i32* [[ARR1]], i32 3
 ; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr i32, i32* [[ARR2:%.*]], i32 0
-; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1
 ; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2
 ; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3
-; CHECK-NEXT:    [[V0:%.*]] = load i32, i32* [[GEP1_0]], align 4
-; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1_1]], align 4
 ; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4
 ; CHECK-NEXT:    [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4
-; CHECK-NEXT:    [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146
-; CHECK-NEXT:    [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1146, i32 146>
 ; CHECK-NEXT:    [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42
 ; CHECK-NEXT:    [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0
-; CHECK-NEXT:    [[RES0:%.*]] = add nsw i32 [[V0]], [[Y0]]
-; CHECK-NEXT:    [[RES1:%.*]] = add nsw i32 [[V1]], [[Y1]]
 ; CHECK-NEXT:    [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]]
 ; CHECK-NEXT:    [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]]
-; CHECK-NEXT:    store i32 [[RES0]], i32* [[GEP2_0]], align 4
-; CHECK-NEXT:    store i32 [[RES1]], i32* [[GEP2_1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[GEP1_0]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP2_0]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    store i32 [[RES2]], i32* [[GEP2_2]], align 4
 ; CHECK-NEXT:    store i32 [[RES3]], i32* [[GEP2_3]], align 4
 ; CHECK-NEXT:    ret void

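This hunk shows the "partial" part of the change most directly: the sdiv in lane 2 still rules out an alternate add/sdiv bundle over all four lanes, but lanes 0-1 are now profitable on their own as a 64-bit <2 x i32> bundle while lanes 2-3 stay scalar. Schematically, restating the IR above:

  ; lanes 0-1: load <2 x i32> -> add nsw <2 x i32> -> store <2 x i32>
  ; lane 2:    scalar sdiv (cannot join an add-only bundle)
  ; lane 3:    scalar add  (left over once lane 2 drops out)
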
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
index 39df2d2029ede..8e0dbe675ab89 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -65,9 +65,10 @@ define void @test_v4f32_v2f32_store(<4 x float> %f, float* %p){
 ; CHECK-LABEL: @test_v4f32_v2f32_store(
 ; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0
 ; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
-; CHECK-NEXT:    store float [[X0]], float* [[P]], align 4
-; CHECK-NEXT:    store float [[X1]], float* [[P1]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[P:%.*]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %x0 = extractelement <4 x float> %f, i64 0
@@ -98,10 +99,11 @@ define void @test_v4f32_v3f32_store(<4 x float> %f, float* %p){
 ; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x float> [[F:%.*]], i64 0
 ; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[F]], i64 1
 ; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x float> [[F]], i64 2
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT:    store float [[X0]], float* [[P]], align 4
-; CHECK-NEXT:    store float [[X1]], float* [[P1]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[P]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    store float [[X2]], float* [[P2]], align 4
 ; CHECK-NEXT:    ret void
 ;

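The three-element variant is worth spelling out: rather than giving up on the whole group, the vectorizer takes the first two stores as a <2 x float> bundle and leaves the odd tail element scalar:

  ; elements 0-1: two insertelements -> store <2 x float>, align 4
  ; element  2:   store float [[X2]], float* [[P2]]   (scalar tail)
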
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll
index ac82e47633d5a..fc61952acab6e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49933.ll
@@ -1,63 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-- -mcpu=skylake-avx512 | FileCheck %s
-; These code should be fully vectorized by D57059 patch
 
 define void @foo(i8* noalias nocapture %t0, i8* noalias nocapture readonly %t1) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[T1:%.*]], align 1, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    [[T4:%.*]] = icmp ult i8 [[T3]], 64
-; CHECK-NEXT:    [[T5:%.*]] = sub i8 0, [[T3]]
-; CHECK-NEXT:    [[T6:%.*]] = select i1 [[T4]], i8 [[T3]], i8 [[T5]]
-; CHECK-NEXT:    store i8 [[T6]], i8* [[T0:%.*]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T7:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 1
-; CHECK-NEXT:    [[T8:%.*]] = load i8, i8* [[T7]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T9:%.*]] = icmp ult i8 [[T8]], 64
-; CHECK-NEXT:    [[T10:%.*]] = sub i8 0, [[T8]]
-; CHECK-NEXT:    [[T11:%.*]] = select i1 [[T9]], i8 [[T8]], i8 [[T10]]
-; CHECK-NEXT:    [[T12:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 1
-; CHECK-NEXT:    store i8 [[T11]], i8* [[T12]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T13:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 2
-; CHECK-NEXT:    [[T14:%.*]] = load i8, i8* [[T13]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T15:%.*]] = icmp ult i8 [[T14]], 64
-; CHECK-NEXT:    [[T16:%.*]] = sub i8 0, [[T14]]
-; CHECK-NEXT:    [[T17:%.*]] = select i1 [[T15]], i8 [[T14]], i8 [[T16]]
-; CHECK-NEXT:    [[T18:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 2
-; CHECK-NEXT:    store i8 [[T17]], i8* [[T18]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T19:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 3
-; CHECK-NEXT:    [[T20:%.*]] = load i8, i8* [[T19]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T21:%.*]] = icmp ult i8 [[T20]], 64
-; CHECK-NEXT:    [[T22:%.*]] = sub i8 0, [[T20]]
-; CHECK-NEXT:    [[T23:%.*]] = select i1 [[T21]], i8 [[T20]], i8 [[T22]]
-; CHECK-NEXT:    [[T24:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 3
-; CHECK-NEXT:    store i8 [[T23]], i8* [[T24]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T25:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 4
-; CHECK-NEXT:    [[T26:%.*]] = load i8, i8* [[T25]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T27:%.*]] = icmp ult i8 [[T26]], 64
-; CHECK-NEXT:    [[T28:%.*]] = sub i8 0, [[T26]]
-; CHECK-NEXT:    [[T29:%.*]] = select i1 [[T27]], i8 [[T26]], i8 [[T28]]
-; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 4
-; CHECK-NEXT:    store i8 [[T29]], i8* [[T30]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T31:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 5
-; CHECK-NEXT:    [[T32:%.*]] = load i8, i8* [[T31]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T33:%.*]] = icmp ult i8 [[T32]], 64
-; CHECK-NEXT:    [[T34:%.*]] = sub i8 0, [[T32]]
-; CHECK-NEXT:    [[T35:%.*]] = select i1 [[T33]], i8 [[T32]], i8 [[T34]]
-; CHECK-NEXT:    [[T36:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 5
-; CHECK-NEXT:    store i8 [[T35]], i8* [[T36]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T37:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 6
-; CHECK-NEXT:    [[T38:%.*]] = load i8, i8* [[T37]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T39:%.*]] = icmp ult i8 [[T38]], 64
-; CHECK-NEXT:    [[T40:%.*]] = sub i8 0, [[T38]]
-; CHECK-NEXT:    [[T41:%.*]] = select i1 [[T39]], i8 [[T38]], i8 [[T40]]
-; CHECK-NEXT:    [[T42:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 6
-; CHECK-NEXT:    store i8 [[T41]], i8* [[T42]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T43:%.*]] = getelementptr inbounds i8, i8* [[T1]], i64 7
-; CHECK-NEXT:    [[T44:%.*]] = load i8, i8* [[T43]], align 1, !tbaa [[TBAA0]]
-; CHECK-NEXT:    [[T45:%.*]] = icmp ult i8 [[T44]], 64
-; CHECK-NEXT:    [[T46:%.*]] = sub i8 0, [[T44]]
-; CHECK-NEXT:    [[T47:%.*]] = select i1 [[T45]], i8 [[T44]], i8 [[T46]]
-; CHECK-NEXT:    [[T48:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 7
-; CHECK-NEXT:    store i8 [[T47]], i8* [[T48]], align 1, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[T1:%.*]] to <8 x i8>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <8 x i8> [[TMP2]], <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i8> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP3]], <8 x i8> [[TMP2]], <8 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[T0:%.*]] to <8 x i8>*
+; CHECK-NEXT:    store <8 x i8> [[TMP5]], <8 x i8>* [[TMP6]], align 1, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %t3 = load i8, i8* %t1, align 1, !tbaa !3

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
index a0d3c939d4ff2..17b9ca1bf0bd3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -13,24 +13,20 @@ define i32 @foo(i32* nocapture readonly %diff) #0 {
 ; CHECK-NEXT:    [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD24:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[TMP1]], 4
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
-; CHECK-NEXT:    store i32 [[ADD3]], i32* [[ARRAYIDX6]], align 16
-; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[ADD3]], [[A_088]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP1]], 1
-; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP1]], 5
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
-; CHECK-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP8]], [[TMP6]]
-; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
-; CHECK-NEXT:    store i32 [[ADD17]], i32* [[ARRAYIDX20]], align 4
-; CHECK-NEXT:    [[ADD24]] = add nsw i32 [[ADD10]], [[ADD17]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX2]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <2 x i32> [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP8]], [[A_088]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX6]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP7]], <2 x i32>* [[TMP9]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; CHECK-NEXT:    [[ADD24]] = add nsw i32 [[ADD10]], [[TMP10]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
@@ -68,11 +64,13 @@ for.body:                                         ; preds = %for.body, %entry
   %add24 = add nsw i32 %add10, %add17
 
   ; YAML:      Pass:            slp-vectorizer
-  ; YAML-NEXT: Name:            NotPossible
+  ; YAML-NEXT: Name:            StoresVectorized
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
-  ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: vectorization was impossible'
-  ; YAML-NEXT:   - String:          ' with available vectorization factors'
+  ; YAML-NEXT:   - String:          'Stores SLP vectorized with cost '
+  ; YAML-NEXT:   - Cost:            '-1'
+  ; YAML-NEXT:   - String:          ' and with tree size '
+  ; YAML-NEXT:   - TreeSize:        '4'
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 8

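The expected remark flips from NotPossible to StoresVectorized: the two-element bundle now has a negative (profitable) cost of -1 with a tree size of 4. For reference, remarks in this YAML form are what the test captures through -pass-remarks-output; a typical invocation, illustrative rather than the file's exact RUN line, would be:

  opt -slp-vectorizer -S -pass-remarks-output=remarks.yaml input.ll
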
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
index e4b35b6f06f9e..83400fe416cba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -10,32 +10,32 @@ define  void @foo (%struct.complex* %A, %struct.complex* %B, %struct.complex* %R
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x float> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x float> [[TMP9]], [[TMP14]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP16:%.*]] = fsub <2 x float> [[TMP12]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[TMP12]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
-; CHECK-NEXT:    store float [[TMP18]], float* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
-; CHECK-NEXT:    store float [[TMP19]], float* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP19]], <2 x float>* [[TMP23]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

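reorder_phi goes beyond a single store: the two scalar accumulators become one loop-carried <2 x float> phi, the products are combined with an fsub/fadd pair blended by a shufflevector, and only the final pair is stored as <2 x float>. The identity being vectorized is the usual complex multiply, restated against the values above:

  ; (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
  ; [[TMP16]] holds the fsub (real) lanes, [[TMP17]] the fadd
  ; (imaginary) lanes; [[TMP18]] = shuffle <i32 0, i32 3> picks the
  ; real part from the first and the imaginary part from the second
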
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
index a5a621b005e0b..57f78dc26367c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -61,15 +61,11 @@ define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[I:%.*]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 undef, [[TMP4]]
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 undef, [[TMP9]]
-; CHECK-NEXT:    store i32 [[TMP10]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> undef, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = add i64 %i, 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
index bff947e28cae7..51b15ce0c577b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -14,14 +14,10 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
-; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
-; CHECK-NEXT:    [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
-; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
-; CHECK-NEXT:    [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
-; CHECK-NEXT:    [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
-; CHECK-NEXT:    store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0) to <2 x i32>*), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], <i32 31, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], <i32 1, i32 1>
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0) to <2 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:

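slp_schedule_bundle keeps its <4 x i32> bundle for elements 0-3 and now picks up the last two elements as a second, 64-bit bundle instead of leaving them scalar; the tail is the body of the first bundle at half width:

  ; elements 4-5: load <2 x i32> -> lshr ..., <i32 31, i32 31>
  ;               -> xor ..., <i32 1, i32 1> -> store <2 x i32>
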
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
index 9349d5d95be7e..b093627d0776d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@@ -71,38 +71,28 @@ define i32 @unrollable(i32* %in, i32* %out, i64 %n) nounwind ssp uwtable {
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    [[I_019:%.*]] = phi i64 [ [[TMP26:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[I_019:%.*]] = phi i64 [ [[TMP18:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[I_019]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP2]], 3
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP4]], 7
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 7
-; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP7]], 7
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 14
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP10]], 7
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 21
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP13]], 7
-; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP20]], 28
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP22]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP5]]
-; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP8]], <i32 7, i32 7>
+; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP9]], <i32 7, i32 14>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul <2 x i32> [[TMP14]], <i32 7, i32 7>
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP15]], <i32 21, i32 28>
+; CHECK-NEXT:    store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[BARRIER:%.*]] = call i32 @goo(i32 0)
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP8]]
-; CHECK-NEXT:    store i32 [[TMP19]], i32* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP11]]
-; CHECK-NEXT:    store i32 [[TMP21]], i32* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP26]] = add i64 [[I_019]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[TMP26]], [[N]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP16]], <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18]] = add i64 [[I_019]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[TMP18]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    ret i32 undef

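In @unrollable, the call to @goo between the store pairs keeps all four stores from forming one <4 x i32> group; with partial store vectorization each pair now becomes its own <2 x i32> group instead of staying scalar. A straight-line sketch of one vectorized iteration body, with the loop stripped and illustrative names:

  declare i32 @goo(i32)

  define void @two_groups(i32* noalias %in, i32* noalias %out) {
    %in2 = getelementptr inbounds i32, i32* %in, i64 2
    %p0 = bitcast i32* %in to <2 x i32>*
    %v0 = load <2 x i32>, <2 x i32>* %p0, align 4
    %m0 = mul <2 x i32> %v0, <i32 7, i32 7>
    %r0 = add <2 x i32> %m0, <i32 7, i32 14>
    %p1 = bitcast i32* %in2 to <2 x i32>*
    %v1 = load <2 x i32>, <2 x i32>* %p1, align 4
    %m1 = mul <2 x i32> %v1, <i32 7, i32 7>
    %r1 = add <2 x i32> %m1, <i32 21, i32 28>
    %q0 = bitcast i32* %out to <2 x i32>*
    store <2 x i32> %r0, <2 x i32>* %q0, align 4
    %barrier = call i32 @goo(i32 0)              ; separates the two store groups
    %out2 = getelementptr inbounds i32, i32* %out, i64 2
    %q1 = bitcast i32* %out2 to <2 x i32>*
    store <2 x i32> %r1, <2 x i32>* %q1, align 4
    ret void
  }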
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
index 72eb65e45fb7c..00ba61064ca24 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -535,14 +535,35 @@ define void @sitofp_8i8_8f64() #0 {
 ;
 
 define void @sitofp_2i64_2f32() #0 {
-; CHECK-LABEL: @sitofp_2i64_2f32(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_2i64_2f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_2i64_2f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_2i64_2f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX512-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256DQ-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 9f31fc2f3dc19..6434b3a25e9f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -535,14 +535,35 @@ define void @sitofp_8i8_8f64() #0 {
 ;
 
 define void @sitofp_2i64_2f32() #0 {
-; CHECK-LABEL: @sitofp_2i64_2f32(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
-; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @sitofp_2i64_2f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    ret void
+;
+; AVX256NODQ-LABEL: @sitofp_2i64_2f32(
+; AVX256NODQ-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT:    [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT:    [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT:    ret void
+;
+; AVX512-LABEL: @sitofp_2i64_2f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX512-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256DQ-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
index 69544d9e59827..89aa0c0429eda 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -472,14 +472,44 @@ define void @uitofp_8i8_8f64() #0 {
 ;
 
 define void @uitofp_2i64_2f32() #0 {
-; CHECK-LABEL: @uitofp_2i64_2f32(
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
-; CHECK-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
-; CHECK-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT:    ret void
+; SSE-LABEL: @uitofp_2i64_2f32(
+; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT:    ret void
+;
+; AVX1-LABEL: @uitofp_2i64_2f32(
+; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX1-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX1-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX1-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX1-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX1-NEXT:    ret void
+;
+; AVX2-LABEL: @uitofp_2i64_2f32(
+; AVX2-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX2-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX2-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX2-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX2-NEXT:    store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX2-NEXT:    store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @uitofp_2i64_2f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX512-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX512-NEXT:    ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i64_2f32(
+; AVX256DQ-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float>
+; AVX256DQ-NEXT:    store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64
+; AVX256DQ-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8

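The sitofp/uitofp splits track the cost model rather than the store logic alone: packed i64-to-f32 conversions are only cheap where AVX512DQ-style instructions (vcvtqq2ps/vcvtuqq2ps) are available, so the AVX512 and AVX256DQ configurations take the <2 x i64> load / <2 x float> store form while SSE, AVX1, AVX2, and AVX256NODQ keep the scalar code. Illustrative invocations (the feature-flag selections here are an assumption; the actual RUN lines use -mcpu choices):

  opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f,+avx512dq,+avx512vl t.ll   ; vectorizes
  opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2 t.ll                          ; stays scalar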
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 7a7974bcfc961..a7d3323219953 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -40,18 +40,15 @@ define void @add1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3
 ; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -88,14 +85,11 @@ define void @sub0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -195,14 +189,13 @@ define void @addsub0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
 ; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -229,22 +222,21 @@ entry:
 define void @addsub1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @addsub1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3
 ; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -272,22 +264,19 @@ entry:
 define void @mul(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP1]], <i32 257, i32 -3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP5]], -9
 ; CHECK-NEXT:    store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -319,18 +308,15 @@ define void @shl0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP5]], 3
 ; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -426,18 +412,15 @@ define void @add1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00
 ; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -474,14 +457,11 @@ define void @sub0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -581,14 +561,13 @@ define void @addsub0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[INCDEC_PTR3]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -615,22 +594,21 @@ entry:
 define void @addsub1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT:    store float [[TMP6]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00
 ; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -658,22 +636,19 @@ entry:
 define void @mulf(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[TMP4]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
 ; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -737,18 +712,15 @@ define void @add1fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
 ; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float 1.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00
 ; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -785,14 +757,11 @@ define void @sub0fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
 ; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -883,22 +852,19 @@ entry:
 define void @mulfn(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[TMP4]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00
 ; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
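The common shape across these vect_copyable_in_binops updates: in a chain of four consecutive lanes where some lanes are plain copies or use mismatched opcodes or constants, SLP now vectorizes the profitable adjacent pair as a <2 x i32> or <2 x float> group and leaves the remaining lanes scalar. Reduced to a standalone function in the after-the-patch form of @add1 (names illustrative):

  define void @partial_chain(i32* noalias %dst, i32* noalias %src) {
  entry:
    %src1 = getelementptr inbounds i32, i32* %src, i64 1
    %c0 = load i32, i32* %src, align 4
    store i32 %c0, i32* %dst, align 4                 ; lane 0: plain copy, stays scalar
    %dst1 = getelementptr inbounds i32, i32* %dst, i64 1
    %p = bitcast i32* %src1 to <2 x i32>*
    %v = load <2 x i32>, <2 x i32>* %p, align 4
    %a = add nsw <2 x i32> %v, <i32 1, i32 2>         ; lanes 1-2: one <2 x i32> group
    %q = bitcast i32* %dst1 to <2 x i32>*
    store <2 x i32> %a, <2 x i32>* %q, align 4
    %src3 = getelementptr inbounds i32, i32* %src, i64 3
    %c3 = load i32, i32* %src3, align 4
    %a3 = add nsw i32 %c3, 3                          ; lane 3: stays scalar
    %dst3 = getelementptr inbounds i32, i32* %dst, i64 3
    store i32 %a3, i32* %dst3, align 4
    ret void
  }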