[llvm] ab2c499 - [SLP] Add insertelement instructions to vectorizable tree

Anton Afanasyev via llvm-commits llvm-commits at lists.llvm.org
Wed May 12 21:42:57 PDT 2021


Author: Anton Afanasyev
Date: 2021-05-13T07:41:45+03:00
New Revision: ab2c499d3a2ed3d3e13d96e456c57fb35a114b31

URL: https://github.com/llvm/llvm-project/commit/ab2c499d3a2ed3d3e13d96e456c57fb35a114b31
DIFF: https://github.com/llvm/llvm-project/commit/ab2c499d3a2ed3d3e13d96e456c57fb35a114b31.diff

LOG: [SLP] Add insertelement instructions to vectorizable tree

Add new type of tree node for `InsertElementInst` chain forming vector.
These instructions could be either removed, or replaced by shuffles during
vectorization and we can add this node to cost model, so naturally estimating
their cost, getting rid of `CompensateCost` tricks and reducing further work
for InstCombine. This fixes PR40522 and PR35732 in a natural way. Also this
patch is the first step towards revectorization of partially vectorization
(to fix PR42022 completely). After adding inserts to tree the next step is
to add vector instructions there (for instance, to merge `store <2 x float>`
and `store <2 x float>` to `store <4 x float>`).

Fixes PR40522 and PR35732.

Differential Revision: https://reviews.llvm.org/D98714

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
    llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
    llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
    llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
    llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
    llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
    llvm/test/Transforms/SLPVectorizer/X86/phi.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
    llvm/test/Transforms/SLPVectorizer/X86/resched.ll
    llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sext.ll
    llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
    llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
    llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/zext.ll
    llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 2c5b4998b4fa..f416a592d683 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -94,15 +94,9 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a list of operands.
-  /// When \p InsertUses is provided and its entries are non-zero
-  /// then users of \p VL are known to be InsertElement instructions
-  /// each associated with same VL entry index. Their cost is then
-  /// used to adjust cost of the vectorization assuming instcombine pass
-  /// then optimizes ExtractElement-InsertElement sequence.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          bool AllowReorder = false,
-                          ArrayRef<Value *> InsertUses = None);
+                          bool AllowReorder = false);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);

diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 222062da13fc..99d9b05d28a3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -542,6 +542,37 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
     Mask[Indices[I]] = I;
 }
 
+/// \returns inserting index of InsertElement or InsertValue instruction,
+/// using Offset as base offset for index.
+static Optional<unsigned> getInsertIndex(Value *InsertInst, unsigned Offset) {
+  unsigned Index = Offset;
+  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+      auto *VT = cast<FixedVectorType>(IE->getType());
+      Index *= VT->getNumElements();
+      Index += CI->getZExtValue();
+      return Index;
+    }
+    return None;
+  }
+
+  auto *IV = cast<InsertValueInst>(InsertInst);
+  Type *CurrentType = IV->getType();
+  for (unsigned I : IV->indices()) {
+    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+      Index *= ST->getNumElements();
+      CurrentType = ST->getElementType(I);
+    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+      Index *= AT->getNumElements();
+      CurrentType = AT->getElementType();
+    } else {
+      return None;
+    }
+    Index += I;
+  }
+  return Index;
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -2203,6 +2234,7 @@ class BoUpSLP {
           auto *In = TE->getMainOp();
           assert(In &&
                  (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+                  isa<InsertElementInst>(In) ||
                   In->getNumOperands() == TE->getNumOperands()) &&
                  "Missed TreeEntry operands?");
           (void)In; // fake use to avoid build failure when assertions disabled
@@ -2636,7 +2668,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Don't handle vectors.
-  if (S.OpValue->getType()->isVectorTy()) {
+  if (S.OpValue->getType()->isVectorTy() &&
+      !isa<InsertElementInst>(S.OpValue)) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
     newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
     return;
@@ -2848,6 +2881,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       BS.cancelScheduling(VL, VL0);
       return;
     }
+    case Instruction::InsertElement: {
+      assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
+
+      int Offset = *getInsertIndex(VL[0], 0);
+      ValueList Operands(VL.size());
+      Operands[0] = cast<Instruction>(VL[0])->getOperand(1);
+      bool IsConsecutive = true;
+      for (unsigned I = 1, E = VL.size(); I < E; ++I) {
+        IsConsecutive &= (I == *getInsertIndex(VL[I], 0) - Offset);
+        Operands[I] = cast<Instruction>(VL[I])->getOperand(1);
+      }
+
+      // FIXME: support vectorization of non-consecutive inserts
+      // using shuffle with its proper cost estimation
+      if (IsConsecutive) {
+        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+        LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+
+        TE->setOperand(0, Operands);
+
+        ValueList VectorOperands;
+        for (Value *V : VL)
+          VectorOperands.push_back(cast<Instruction>(V)->getOperand(0));
+
+        TE->setOperand(1, VectorOperands);
+
+        buildTree_rec(Operands, Depth + 1, {TE, 0});
+        return;
+      }
+
+      LLVM_DEBUG(dbgs() << "SLP: skipping non-consecutive inserts.\n");
+      BS.cancelScheduling(VL, VL0);
+      buildTree_rec(Operands, Depth, UserTreeIdx);
+      return;
+    }
     case Instruction::Load: {
       // Check that a vectorized load would load the same memory as a scalar
       // load. For example, we don't want to vectorize loads that are smaller
@@ -3526,6 +3594,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
     ScalarTy = SI->getValueOperand()->getType();
   else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
     ScalarTy = CI->getOperand(0)->getType();
+  else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+    ScalarTy = IE->getOperand(1)->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
@@ -3732,6 +3802,28 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
       }
       return CommonCost;
     }
+    case Instruction::InsertElement: {
+      auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
+
+      unsigned const NumElts = SrcVecTy->getNumElements();
+      APInt DemandedElts = APInt::getNullValue(NumElts);
+      for (auto *V : VL)
+        DemandedElts.setBit(*getInsertIndex(V, 0));
+
+      InstructionCost Cost = 0;
+      Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
+                                            /*Insert*/ true, /*Extract*/ false);
+
+      unsigned const NumScalars = VL.size();
+      unsigned const Offset = *getInsertIndex(VL[0], 0);
+      if (NumElts != NumScalars && Offset % NumScalars != 0)
+        Cost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_InsertSubvector, SrcVecTy, /*Mask*/ None,
+            Offset,
+            FixedVectorType::get(SrcVecTy->getElementType(), NumScalars));
+
+      return Cost;
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
@@ -4129,6 +4221,12 @@ bool BoUpSLP::isLoadCombineCandidate() const {
 }
 
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+  // No need to vectorize inserts of gathered values.
+  if (VectorizableTree.size() == 2 &&
+      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
+      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+    return true;
+
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
   if (VectorizableTree.size() >= MinTreeSize)
@@ -4261,6 +4359,10 @@ InstructionCost BoUpSLP::getTreeCost() {
     if (EphValues.count(EU.User))
       continue;
 
+    // No extract cost for vector "scalar"
+    if (isa<FixedVectorType>(EU.Scalar->getType()))
+      continue;
+
     // If we plan to rewrite the tree in a smaller type, we will need to sign
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
@@ -4680,6 +4782,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   Type *ScalarTy = VL0->getType();
   if (auto *Store = dyn_cast<StoreInst>(VL0))
     ScalarTy = Store->getValueOperand()->getType();
+  else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
+    ScalarTy = IE->getOperand(1)->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
@@ -4739,6 +4843,43 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       E->VectorizedValue = NewV;
       return NewV;
     }
+    case Instruction::InsertElement: {
+      Builder.SetInsertPoint(VL0);
+      Value *V = vectorizeTree(E->getOperand(0));
+
+      const unsigned NumElts =
+          cast<FixedVectorType>(VL0->getType())->getNumElements();
+      const unsigned NumScalars = E->Scalars.size();
+
+      // Create InsertVector shuffle if necessary
+      if (NumElts != NumScalars) {
+        unsigned MinIndex = *getInsertIndex(E->Scalars[0], 0);
+        Instruction *FirstInsert = nullptr;
+        for (auto *Scalar : E->Scalars)
+          if (!FirstInsert &&
+              !is_contained(E->Scalars,
+                            cast<Instruction>(Scalar)->getOperand(0)))
+            FirstInsert = cast<Instruction>(Scalar);
+
+        // Create shuffle to resize vector
+        SmallVector<int, 16> Mask(NumElts, UndefMaskElem);
+        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
+
+        const unsigned MaxIndex = MinIndex + NumScalars;
+        for (unsigned I = 0; I < NumElts; I++)
+          Mask[I] =
+              (I < MinIndex || I >= MaxIndex) ? I : NumElts - MinIndex + I;
+
+        V = Builder.CreateShuffleVector(
+            FirstInsert->getOperand(0), V, Mask,
+            cast<Instruction>(E->Scalars[NumScalars - 1])->getName());
+      }
+
+      ++NumVectorInstructions;
+      E->VectorizedValue = V;
+      return V;
+    }
     case Instruction::ZExt:
     case Instruction::SExt:
     case Instruction::FPToUI:
@@ -5164,16 +5305,6 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                     << " values .\n");
 
-  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
-  // specified by ScalarType.
-  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
-    if (!MinBWs.count(ScalarRoot))
-      return Ex;
-    if (MinBWs[ScalarRoot].second)
-      return Builder.CreateSExt(Ex, ScalarType);
-    return Builder.CreateZExt(Ex, ScalarType);
-  };
-
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -5192,6 +5323,23 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
     assert(Vec && "Can't find vectorizable value");
 
     Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
+      if (Scalar->getType() != Vec->getType()) {
+        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+        // If necessary, sign-extend or zero-extend ScalarRoot
+        // to the larger type.
+        if (!MinBWs.count(ScalarRoot))
+          return Ex;
+        if (MinBWs[ScalarRoot].second)
+          return Builder.CreateSExt(Ex, Scalar->getType());
+        return Builder.CreateZExt(Ex, Scalar->getType());
+      } else {
+        assert(isa<FixedVectorType>(Scalar->getType()) &&
+               isa<InsertElementInst>(Scalar) &&
+               "In-tree scalar of vector type is not insertelement?");
+        return Vec;
+      }
+    };
     // If User == nullptr, the Scalar is used as extra arg. Generate
     // ExtractElement instruction and update the record for this scalar in
     // ExternallyUsedValues.
@@ -5205,14 +5353,13 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       } else {
         Builder.SetInsertPoint(&F->getEntryBlock().front());
       }
-      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-      Ex = extend(ScalarRoot, Ex, Scalar->getType());
+      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
       CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
       auto &Locs = ExternallyUsedValues[Scalar];
-      ExternallyUsedValues.insert({Ex, Locs});
+      ExternallyUsedValues.insert({NewInst, Locs});
       ExternallyUsedValues.erase(Scalar);
       // Required to update internally referenced instructions.
-      Scalar->replaceAllUsesWith(Ex);
+      Scalar->replaceAllUsesWith(NewInst);
       continue;
     }
 
@@ -5230,25 +5377,22 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
             } else {
               Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
             }
-            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-            Ex = extend(ScalarRoot, Ex, Scalar->getType());
+            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
             CSEBlocks.insert(PH->getIncomingBlock(i));
-            PH->setOperand(i, Ex);
+            PH->setOperand(i, NewInst);
           }
         }
       } else {
         Builder.SetInsertPoint(cast<Instruction>(User));
-        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-        Ex = extend(ScalarRoot, Ex, Scalar->getType());
+        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
         CSEBlocks.insert(cast<Instruction>(User)->getParent());
-        User->replaceUsesOfWith(Scalar, Ex);
+        User->replaceUsesOfWith(Scalar, NewInst);
       }
     } else {
       Builder.SetInsertPoint(&F->getEntryBlock().front());
-      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-      Ex = extend(ScalarRoot, Ex, Scalar->getType());
+      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
       CSEBlocks.insert(&F->getEntryBlock());
-      User->replaceUsesOfWith(Scalar, Ex);
+      User->replaceUsesOfWith(Scalar, NewInst);
     }
 
     LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
@@ -5661,7 +5805,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
           for (User *U : BundleMember->Inst->users()) {
             if (isa<Instruction>(U)) {
               ScheduleData *UseSD = getScheduleData(U);
-              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+              if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle) &&
+                  // Ignore inner deps for insertelement
+                  !(UseSD->FirstInBundle == SD &&
+                    isa<InsertElementInst>(BundleMember->Inst))) {
                 BundleMember->Dependencies++;
                 ScheduleData *DestBundle = UseSD->FirstInBundle;
                 if (!DestBundle->IsScheduled)
@@ -5840,6 +5987,9 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
     return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
   }
 
+  if (auto *IEI = dyn_cast<InsertElementInst>(V))
+    return getVectorElementSize(IEI->getOperand(1));
+
   auto E = InstrElementSize.find(V);
   if (E != InstrElementSize.end())
     return E->second;
@@ -6487,8 +6637,7 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
 }
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           bool AllowReorder,
-                                           ArrayRef<Value *> InsertUses) {
+                                           bool AllowReorder) {
   if (VL.size() < 2)
     return false;
 
@@ -6506,7 +6655,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   // determining vectorization factor for scalar instructions.
   for (Value *V : VL) {
     Type *Ty = V->getType();
-    if (!isValidElementType(Ty)) {
+    if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
       // NOTE: the following will give user internal llvm type name, which may
       // not be useful.
       R.getORE()->emit([&]() {
@@ -6537,20 +6686,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   bool Changed = false;
   bool CandidateFound = false;
   InstructionCost MinCost = SLPCostThreshold.getValue();
-
-  bool CompensateUseCost =
-      !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
-        return V && isa<InsertElementInst>(V);
-      });
-  assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
-         "Each scalar expected to have an associated InsertElement user.");
+  Type *ScalarTy = VL[0]->getType();
+  if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+    ScalarTy = IE->getOperand(1)->getType();
 
   unsigned NextInst = 0, MaxInst = VL.size();
   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
-    auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
+    auto *VecTy = FixedVectorType::get(ScalarTy, VF);
     if (TTI->getNumberOfParts(VecTy) == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {
@@ -6594,46 +6739,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       R.computeMinimumValueSizes();
       InstructionCost Cost = R.getTreeCost();
       CandidateFound = true;
-      if (CompensateUseCost) {
-        // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
-        // rather than sum of single inserts as the latter may overestimate
-        // cost. This work should imply improving cost estimation for extracts
-        // that added in for external (for vectorization tree) users,i.e. that
-        // part should also switch to same interface.
-        // For example, the following case is projected code after SLP:
-        //  %4 = extractelement <4 x i64> %3, i32 0
-        //  %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
-        //  %5 = extractelement <4 x i64> %3, i32 1
-        //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
-        //  %6 = extractelement <4 x i64> %3, i32 2
-        //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
-        //  %7 = extractelement <4 x i64> %3, i32 3
-        //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
-        //
-        // Extracts here added by SLP in order to feed users (the inserts) of
-        // original scalars and contribute to "ExtractCost" at cost evaluation.
-        // The inserts in turn form sequence to build an aggregate that
-        // detected by findBuildAggregate routine.
-        // SLP makes an assumption that such sequence will be optimized away
-        // later (instcombine) so it tries to compensate ExctractCost with
-        // cost of insert sequence.
-        // Current per element cost calculation approach is not quite accurate
-        // and tends to create bias toward favoring vectorization.
-        // Switching to the TTI interface might help a bit.
-        // Alternative solution could be pattern-match to detect a no-op or
-        // shuffle.
-        InstructionCost UserCost = 0;
-        for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
-          auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
-          if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
-            UserCost += TTI->getVectorInstrCost(
-                Instruction::InsertElement, IE->getType(), CI->getZExtValue());
-        }
-        LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
-                          << ".\n");
-        Cost -= UserCost;
-      }
-
       MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
@@ -7451,36 +7556,6 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
   } while (true);
 }
 
-static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
-                                          unsigned OperandOffset) {
-  unsigned OperandIndex = OperandOffset;
-  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
-    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
-      auto *VT = cast<FixedVectorType>(IE->getType());
-      OperandIndex *= VT->getNumElements();
-      OperandIndex += CI->getZExtValue();
-      return OperandIndex;
-    }
-    return None;
-  }
-
-  auto *IV = cast<InsertValueInst>(InsertInst);
-  Type *CurrentType = IV->getType();
-  for (unsigned int Index : IV->indices()) {
-    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
-      OperandIndex *= ST->getNumElements();
-      CurrentType = ST->getElementType(Index);
-    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
-      OperandIndex *= AT->getNumElements();
-      CurrentType = AT->getElementType();
-    } else {
-      return None;
-    }
-    OperandIndex += Index;
-  }
-  return OperandIndex;
-}
-
 static bool findBuildAggregate_rec(Instruction *LastInsertInst,
                                    TargetTransformInfo *TTI,
                                    SmallVectorImpl<Value *> &BuildVectorOpds,
@@ -7489,7 +7564,7 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
   do {
     Value *InsertedOperand = LastInsertInst->getOperand(1);
     Optional<unsigned> OperandIndex =
-        getOperandIndex(LastInsertInst, OperandOffset);
+        getInsertIndex(LastInsertInst, OperandOffset);
     if (!OperandIndex)
       return false;
     if (isa<InsertElementInst>(InsertedOperand) ||
@@ -7751,8 +7826,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
-                            BuildVectorInsts);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
@@ -7766,10 +7840,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
        isShuffle(BuildVectorOpds, Mask)))
     return false;
 
-  // Vectorize starting with the build vector operands ignoring the BuildVector
-  // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
-                            BuildVectorInsts);
+  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
+  return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/false);
 }
 
 bool SLPVectorizerPass::vectorizeSimpleInstructions(

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
index 2e437ead6d2c..b3b700e03784 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
@@ -13,15 +13,7 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_sin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -33,12 +25,13 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -64,29 +57,13 @@ define <4 x float> @ceil_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @ceil_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -112,29 +89,13 @@ define <4 x float> @fabs_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @fabs_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -158,29 +119,13 @@ define <4 x float> @int_fabs_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_fabs_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -204,29 +149,13 @@ define <4 x float> @floor_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @floor_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -250,29 +179,13 @@ define <4 x float> @sqrt_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sqrt_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -296,15 +209,7 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @exp_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -316,12 +221,13 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -345,15 +251,7 @@ define <4 x float> @expm1_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @expm1_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -394,15 +292,7 @@ define <4 x float> @log_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @log_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -414,12 +304,13 @@ define <4 x float> @log_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -443,15 +334,7 @@ define <4 x float> @log1p_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @log1p_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -544,15 +427,7 @@ define <4 x float> @logb_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @logb_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -593,15 +468,7 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -613,12 +480,13 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -642,15 +510,7 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @cos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -662,12 +522,13 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -691,15 +552,7 @@ define <4 x float> @tan_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @tan_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -740,15 +593,7 @@ define <4 x float> @asin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @asin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -789,15 +634,7 @@ define <4 x float> @acos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @acos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -838,15 +675,7 @@ define <4 x float> @atan_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @atan_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -887,15 +716,7 @@ define <4 x float> @sinh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sinh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -936,15 +757,7 @@ define <4 x float> @cosh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @cosh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -985,15 +798,7 @@ define <4 x float> @tanh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @tanh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1034,15 +839,7 @@ define <4 x float> @asinh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @asinh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1083,15 +880,7 @@ define <4 x float> @acosh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @acosh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1132,15 +921,7 @@ define <4 x float> @atanh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @atanh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1182,10 +963,10 @@ define <2 x float> @sin_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) [[ATTR2:#.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) [[ATTR2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
@@ -1220,15 +1001,7 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_cos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1240,12 +1013,13 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -1270,10 +1044,10 @@ define <2 x float> @cos_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) [[ATTR3:#.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) [[ATTR3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index 6614b4178b8b..791eb36de8c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -13,15 +13,7 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_sin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -33,12 +25,13 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -64,29 +57,13 @@ define <4 x float> @ceil_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @ceil_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -112,29 +89,13 @@ define <4 x float> @fabs_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @fabs_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -158,29 +119,13 @@ define <4 x float> @int_fabs_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_fabs_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -204,29 +149,13 @@ define <4 x float> @floor_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @floor_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -250,29 +179,13 @@ define <4 x float> @sqrt_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sqrt_4x(
 ; NOACCELERATE-NEXT:  entry:
 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -296,15 +209,7 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @exp_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -316,12 +221,13 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -345,15 +251,7 @@ define <4 x float> @expm1_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @expm1_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -394,15 +292,7 @@ define <4 x float> @log_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @log_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -414,12 +304,13 @@ define <4 x float> @log_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -443,15 +334,7 @@ define <4 x float> @log1p_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @log1p_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -544,15 +427,7 @@ define <4 x float> @logb_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @logb_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -593,15 +468,7 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -613,12 +480,13 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -642,15 +510,7 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @cos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -662,12 +522,13 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @cosf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -691,15 +552,7 @@ define <4 x float> @tan_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @tan_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -740,15 +593,7 @@ define <4 x float> @asin_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @asin_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -789,15 +634,7 @@ define <4 x float> @acos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @acos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -838,15 +675,7 @@ define <4 x float> @atan_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @atan_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -887,15 +716,7 @@ define <4 x float> @sinh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @sinh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -936,15 +757,7 @@ define <4 x float> @cosh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @cosh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -985,15 +798,7 @@ define <4 x float> @tanh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @tanh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1034,15 +839,7 @@ define <4 x float> @asinh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @asinh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1083,15 +880,7 @@ define <4 x float> @acosh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @acosh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1132,15 +921,7 @@ define <4 x float> @atanh_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @atanh_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1182,10 +963,10 @@ define <2 x float> @sin_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #2
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #2
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
@@ -1220,15 +1001,7 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; NOACCELERATE-LABEL: @int_cos_4x(
 ; NOACCELERATE-NEXT:  entry:
@@ -1240,12 +1013,13 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -1270,10 +1044,10 @@ define <2 x float> @cos_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #3
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #3
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 556a1187e10e..afd8a981ea69 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -29,40 +29,24 @@ define void @PR28330(i32 %n) {
 ; GATHER:       for.body:
 ; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
-; GATHER-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; GATHER-NEXT:    [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0
-; GATHER-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
-; GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1
-; GATHER-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
-; GATHER-NEXT:    [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2
-; GATHER-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
-; GATHER-NEXT:    [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3
-; GATHER-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
-; GATHER-NEXT:    [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4
-; GATHER-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
-; GATHER-NEXT:    [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5
-; GATHER-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
-; GATHER-NEXT:    [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6
-; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7
-; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
-; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
-; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
-; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
-; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
-; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
-; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
-; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
-; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
-; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
-; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
-; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
-; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
-; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
-; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
-; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
-; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
-; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]]
+; GATHER-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; GATHER-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; GATHER-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; GATHER-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; GATHER-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP18]], [[P17]]
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR28330(
@@ -165,40 +149,24 @@ define void @PR32038(i32 %n) {
 ; GATHER:       for.body:
 ; GATHER-NEXT:    [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; GATHER-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
-; GATHER-NEXT:    [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; GATHER-NEXT:    [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0
-; GATHER-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
-; GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1
-; GATHER-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
-; GATHER-NEXT:    [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2
-; GATHER-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
-; GATHER-NEXT:    [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3
-; GATHER-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
-; GATHER-NEXT:    [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4
-; GATHER-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
-; GATHER-NEXT:    [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5
-; GATHER-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
-; GATHER-NEXT:    [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6
-; GATHER-NEXT:    [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7
-; GATHER-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
-; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
-; GATHER-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
-; GATHER-NEXT:    [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
-; GATHER-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
-; GATHER-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
-; GATHER-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
-; GATHER-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
-; GATHER-NEXT:    [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
-; GATHER-NEXT:    [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
-; GATHER-NEXT:    [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
-; GATHER-NEXT:    [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
-; GATHER-NEXT:    [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
-; GATHER-NEXT:    [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
-; GATHER-NEXT:    [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
-; GATHER-NEXT:    [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
-; GATHER-NEXT:    [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
-; GATHER-NEXT:    [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
-; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP35]], -5
+; GATHER-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT:    [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT:    [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT:    [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT:    [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; GATHER-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; GATHER-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; GATHER-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; GATHER-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; GATHER-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; GATHER-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; GATHER-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
+; GATHER-NEXT:    [[OP_EXTRA]] = add i32 [[TMP18]], -5
+; GATHER-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
 ; GATHER-NEXT:    br label [[FOR_BODY]]
 ;
 ; MAX-COST-LABEL: @PR32038(
@@ -220,16 +188,16 @@ define void @PR32038(i32 %n) {
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; MAX-COST-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; MAX-COST-NEXT:    [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[P5]], i32 2
+; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[P7]], i32 3
+; MAX-COST-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; MAX-COST-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
 ; MAX-COST-NEXT:    [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
 ; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
 ; MAX-COST-NEXT:    [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
index 658a487485eb..c6ceb1cc7e79 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll
@@ -7,11 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x float> @insertelement-fixed-vector() {
 ; CHECK-LABEL: @insertelement-fixed-vector(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[I1]]
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
 ;
   %f0 = tail call fast float @llvm.fabs.f32(float undef)
   %f1 = tail call fast float @llvm.fabs.f32(float undef)

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
index dfa2d52ed4c8..12031a679e83 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
@@ -7,11 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x float> @insertelement-fixed-vector() {
 ; CHECK-LABEL: @insertelement-fixed-vector(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[I1]]
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
 ;
   %f0 = tail call fast float @llvm.fabs.f32(float undef)
   %f1 = tail call fast float @llvm.fabs.f32(float undef)

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 2f09d182f69c..8a903b406fc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -123,8 +123,8 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -152,15 +152,15 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP2_11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2_32:%.*]] = shufflevector <4 x i32> [[TMP2_11]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2_32]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -198,11 +198,11 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0
 ; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 170a7a16bbb7..d0147a3328fe 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -123,8 +123,8 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -152,15 +152,15 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP2_11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2_32:%.*]] = shufflevector <4 x i32> [[TMP2_11]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2_32]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -198,11 +198,11 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
 ; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
index 9b2ee74e39ca..756676d7ee14 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll
@@ -12,18 +12,17 @@ define void @noop_extracts_first_2_lanes(<2 x double>* %ptr.1, <4 x double>* %pt
 ; CHECK-LABEL: @noop_extracts_first_2_lanes(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
-; CHECK-NEXT:    [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT:    [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
 ; CHECK-NEXT:    [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_3]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
-; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
-; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
-; CHECK-NEXT:    store <2 x double> [[A_INS_1]], <2 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
+; CHECK-NEXT:    call void @use(double [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
+; CHECK-NEXT:    call void @use(double [[TMP4]])
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -58,13 +57,14 @@ define void @extracts_first_2_lanes_
diff erent_vectors(<2 x double>* %ptr.1, <4 x
 ; CHECK-NEXT:    [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 1
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V3_LANE_1]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V3_LANE_1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V3_LANE_1]])
-; CHECK-NEXT:    store <2 x double> [[A_INS_1]], <2 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -104,13 +104,11 @@ define void @noop_extract_second_2_lanes(<4 x double>* %ptr.1, <4 x double>* %pt
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <4 x double> undef, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <4 x double> [[A_INS_0]], double [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[A_INS_11:%.*]] = shufflevector <4 x double> undef, <4 x double> [[TMP5]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_3]])
-; CHECK-NEXT:    store <4 x double> [[A_INS_1]], <4 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    store <4 x double> [[A_INS_11]], <4 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -139,17 +137,17 @@ define void @extract_reverse_order(<2 x double>* %ptr.1, <4 x double>* %ptr.2) {
 ; CHECK-LABEL: @extract_reverse_order(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
-; CHECK-NEXT:    [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
-; CHECK-NEXT:    [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V_1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <2 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <2 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
-; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
-; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
-; CHECK-NEXT:    store <2 x double> [[A_INS_1]], <2 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[SHUFFLE]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 1
+; CHECK-NEXT:    call void @use(double [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    call void @use(double [[TMP4]])
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -182,13 +180,16 @@ define void @extract_lanes_1_and_2(<4 x double>* %ptr.1, <4 x double>* %ptr.2) {
 ; CHECK-NEXT:    [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
 ; CHECK-NEXT:    [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <4 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <4 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[A_INS_11:%.*]] = shufflevector <4 x double> undef, <4 x double> [[TMP5]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
-; CHECK-NEXT:    store <4 x double> [[A_INS_1]], <4 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    store <4 x double> [[A_INS_11]], <4 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -227,24 +228,21 @@ define void @noop_extracts_existing_vector_4_lanes(<9 x double>* %ptr.1, <4 x do
 ; CHECK-NEXT:    [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[A_LANE_2:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_3:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP6]], i32 1
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
-; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[V2_LANE_0]], i32 1
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[A_INS_31:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP7]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_0]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_1]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_2]])
 ; CHECK-NEXT:    call void @use(double [[V1_LANE_3]])
-; CHECK-NEXT:    store <9 x double> [[A_INS_3]], <9 x double>* [[PTR_1]], align 8
+; CHECK-NEXT:    store <9 x double> [[A_INS_31]], <9 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 bb:
@@ -367,58 +365,30 @@ define void @noop_extracts_9_lanes(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V2_LANE_1]], i32 7
 ; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP7]], [[TMP15]]
 ; CHECK-NEXT:    [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x double> [[TMP16]], i32 0
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x double> [[TMP16]], i32 1
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x double> [[TMP16]], i32 2
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x double> [[TMP16]], i32 3
-; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <8 x double> [[TMP16]], i32 4
-; CHECK-NEXT:    [[A_INS_4:%.*]] = insertelement <9 x double> [[A_INS_3]], double [[TMP21]], i32 4
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x double> [[TMP16]], i32 5
-; CHECK-NEXT:    [[A_INS_5:%.*]] = insertelement <9 x double> [[A_INS_4]], double [[TMP22]], i32 5
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <8 x double> [[TMP16]], i32 6
-; CHECK-NEXT:    [[A_INS_6:%.*]] = insertelement <9 x double> [[A_INS_5]], double [[TMP23]], i32 6
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x double> [[TMP16]], i32 7
-; CHECK-NEXT:    [[A_INS_7:%.*]] = insertelement <9 x double> [[A_INS_6]], double [[TMP24]], i32 7
-; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_7]], double [[A_LANE_8]], i32 8
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[V1_LANE_7]], i32 1
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x double> [[TMP26]], double [[V1_LANE_8]], i32 2
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x double> [[TMP27]], double [[V1_LANE_0]], i32 3
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> [[TMP28]], double [[V1_LANE_1]], i32 4
-; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x double> [[TMP29]], double [[V1_LANE_2]], i32 5
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP30]], double [[V1_LANE_3]], i32 6
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[V1_LANE_4]], i32 7
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <8 x double> [[TMP33]], double [[V2_LANE_1]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <8 x double> [[TMP34]], double [[V2_LANE_0]], i32 2
-; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <8 x double> [[TMP35]], double [[V2_LANE_2]], i32 3
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <8 x double> [[TMP36]], double [[V2_LANE_1]], i32 4
-; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <8 x double> [[TMP37]], double [[V2_LANE_0]], i32 5
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <8 x double> [[TMP38]], double [[V2_LANE_2]], i32 6
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <8 x double> [[TMP39]], double [[V2_LANE_1]], i32 7
-; CHECK-NEXT:    [[TMP41:%.*]] = fmul <8 x double> [[TMP32]], [[TMP40]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[A_INS_72:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP17]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_7]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_8]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x double> [[TMP20]], double [[V1_LANE_0]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V1_LANE_1]], i32 4
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V1_LANE_2]], i32 5
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <8 x double> [[TMP23]], double [[V1_LANE_3]], i32 6
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP24]], double [[V1_LANE_4]], i32 7
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x double> [[TMP26]], double [[V2_LANE_1]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x double> [[TMP27]], double [[V2_LANE_0]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> [[TMP28]], double [[V2_LANE_2]], i32 3
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x double> [[TMP29]], double [[V2_LANE_1]], i32 4
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP30]], double [[V2_LANE_0]], i32 5
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[V2_LANE_2]], i32 6
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[V2_LANE_1]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP25]], [[TMP33]]
 ; CHECK-NEXT:    [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <8 x double> [[TMP41]], i32 0
-; CHECK-NEXT:    [[B_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP42]], i32 0
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <8 x double> [[TMP41]], i32 1
-; CHECK-NEXT:    [[B_INS_1:%.*]] = insertelement <9 x double> [[B_INS_0]], double [[TMP43]], i32 1
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <8 x double> [[TMP41]], i32 2
-; CHECK-NEXT:    [[B_INS_2:%.*]] = insertelement <9 x double> [[B_INS_1]], double [[TMP44]], i32 2
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <8 x double> [[TMP41]], i32 3
-; CHECK-NEXT:    [[B_INS_3:%.*]] = insertelement <9 x double> [[B_INS_2]], double [[TMP45]], i32 3
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <8 x double> [[TMP41]], i32 4
-; CHECK-NEXT:    [[B_INS_4:%.*]] = insertelement <9 x double> [[B_INS_3]], double [[TMP46]], i32 4
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <8 x double> [[TMP41]], i32 5
-; CHECK-NEXT:    [[B_INS_5:%.*]] = insertelement <9 x double> [[B_INS_4]], double [[TMP47]], i32 5
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <8 x double> [[TMP41]], i32 6
-; CHECK-NEXT:    [[B_INS_6:%.*]] = insertelement <9 x double> [[B_INS_5]], double [[TMP48]], i32 6
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <8 x double> [[TMP41]], i32 7
-; CHECK-NEXT:    [[B_INS_7:%.*]] = insertelement <9 x double> [[B_INS_6]], double [[TMP49]], i32 7
-; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_7]], double [[B_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[B_INS_71:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP35]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
 ; CHECK-NEXT:    [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
 ; CHECK-NEXT:    store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
@@ -504,32 +474,14 @@ define void @first_mul_chain_jumbled(<9 x double>* %ptr.1, <4 x double>* %ptr.2)
 ; CHECK-NEXT:    [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_LANE_2:%.*]] = fmul double [[V1_LANE_6]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_3:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_LANE_4:%.*]] = fmul double [[V1_LANE_8]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_5:%.*]] = fmul double [[V1_LANE_7]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[A_LANE_6:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_LANE_7:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
-; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
-; CHECK-NEXT:    [[A_INS_4:%.*]] = insertelement <9 x double> [[A_INS_3]], double [[A_LANE_4]], i32 4
-; CHECK-NEXT:    [[A_INS_5:%.*]] = insertelement <9 x double> [[A_INS_4]], double [[A_LANE_5]], i32 5
-; CHECK-NEXT:    [[A_INS_6:%.*]] = insertelement <9 x double> [[A_INS_5]], double [[A_LANE_6]], i32 6
-; CHECK-NEXT:    [[A_INS_7:%.*]] = insertelement <9 x double> [[A_INS_6]], double [[A_LANE_7]], i32 7
-; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_7]], double [[A_LANE_8]], i32 8
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_7]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_8]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_1]], i32 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_2]], i32 5
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_3]], i32 6
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_4]], i32 7
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_6]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_5]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_1]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_0]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_2]], i32 2
@@ -539,24 +491,23 @@ define void @first_mul_chain_jumbled(<9 x double>* %ptr.1, <4 x double>* %ptr.2)
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V2_LANE_0]], i32 6
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V2_LANE_2]], i32 7
 ; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP7]], [[TMP15]]
+; CHECK-NEXT:    [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[A_INS_72:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP17]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_7]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_8]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x double> [[TMP20]], double [[V1_LANE_0]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V1_LANE_1]], i32 4
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V1_LANE_2]], i32 5
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <8 x double> [[TMP23]], double [[V1_LANE_3]], i32 6
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP24]], double [[V1_LANE_4]], i32 7
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul <8 x double> [[TMP25]], [[TMP15]]
 ; CHECK-NEXT:    [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x double> [[TMP16]], i32 0
-; CHECK-NEXT:    [[B_INS_0:%.*]] = insertelement <9 x double> undef, double [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x double> [[TMP16]], i32 1
-; CHECK-NEXT:    [[B_INS_1:%.*]] = insertelement <9 x double> [[B_INS_0]], double [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x double> [[TMP16]], i32 2
-; CHECK-NEXT:    [[B_INS_2:%.*]] = insertelement <9 x double> [[B_INS_1]], double [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x double> [[TMP16]], i32 3
-; CHECK-NEXT:    [[B_INS_3:%.*]] = insertelement <9 x double> [[B_INS_2]], double [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <8 x double> [[TMP16]], i32 4
-; CHECK-NEXT:    [[B_INS_4:%.*]] = insertelement <9 x double> [[B_INS_3]], double [[TMP21]], i32 4
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x double> [[TMP16]], i32 5
-; CHECK-NEXT:    [[B_INS_5:%.*]] = insertelement <9 x double> [[B_INS_4]], double [[TMP22]], i32 5
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <8 x double> [[TMP16]], i32 6
-; CHECK-NEXT:    [[B_INS_6:%.*]] = insertelement <9 x double> [[B_INS_5]], double [[TMP23]], i32 6
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x double> [[TMP16]], i32 7
-; CHECK-NEXT:    [[B_INS_7:%.*]] = insertelement <9 x double> [[B_INS_6]], double [[TMP24]], i32 7
-; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_7]], double [[B_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[B_INS_71:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP27]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
 ; CHECK-NEXT:    [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
 ; CHECK-NEXT:    store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void
@@ -642,42 +593,48 @@ define void @first_and_second_mul_chain_jumbled(<9 x double>* %ptr.1, <4 x doubl
 ; CHECK-NEXT:    [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
 ; CHECK-NEXT:    [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
 ; CHECK-NEXT:    [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
-; CHECK-NEXT:    [[A_LANE_0:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_LANE_1:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_2:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[A_LANE_3:%.*]] = fmul double [[V1_LANE_6]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_4:%.*]] = fmul double [[V1_LANE_8]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[A_LANE_5:%.*]] = fmul double [[V1_LANE_7]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_LANE_6:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[A_LANE_7:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[V2_LANE_2]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[V2_LANE_1]], i32 4
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[V2_LANE_0]], i32 5
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V2_LANE_2]], i32 6
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V2_LANE_1]], i32 7
+; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP7]], [[TMP15]]
 ; CHECK-NEXT:    [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[A_LANE_0]], i32 0
-; CHECK-NEXT:    [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
-; CHECK-NEXT:    [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
-; CHECK-NEXT:    [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
-; CHECK-NEXT:    [[A_INS_4:%.*]] = insertelement <9 x double> [[A_INS_3]], double [[A_LANE_4]], i32 4
-; CHECK-NEXT:    [[A_INS_5:%.*]] = insertelement <9 x double> [[A_INS_4]], double [[A_LANE_5]], i32 5
-; CHECK-NEXT:    [[A_INS_6:%.*]] = insertelement <9 x double> [[A_INS_5]], double [[A_LANE_6]], i32 6
-; CHECK-NEXT:    [[A_INS_7:%.*]] = insertelement <9 x double> [[A_INS_6]], double [[A_LANE_7]], i32 7
-; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_7]], double [[A_LANE_8]], i32 8
-; CHECK-NEXT:    [[B_LANE_0:%.*]] = fmul double [[V1_LANE_7]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[B_LANE_1:%.*]] = fmul double [[V1_LANE_6]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[B_LANE_2:%.*]] = fmul double [[V1_LANE_8]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[B_LANE_3:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[B_LANE_4:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_0]]
-; CHECK-NEXT:    [[B_LANE_5:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[B_LANE_6:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
-; CHECK-NEXT:    [[B_LANE_7:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[A_INS_72:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP17]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_7]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_6]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_8]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x double> [[TMP20]], double [[V1_LANE_1]], i32 3
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V1_LANE_0]], i32 4
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V1_LANE_3]], i32 5
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <8 x double> [[TMP23]], double [[V1_LANE_2]], i32 6
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP24]], double [[V1_LANE_5]], i32 7
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x double> [[TMP26]], double [[V2_LANE_1]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x double> [[TMP27]], double [[V2_LANE_0]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> [[TMP28]], double [[V2_LANE_2]], i32 3
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x double> [[TMP29]], double [[V2_LANE_0]], i32 4
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP30]], double [[V2_LANE_2]], i32 5
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[V2_LANE_1]], i32 6
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[V2_LANE_0]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP25]], [[TMP33]]
 ; CHECK-NEXT:    [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]]
-; CHECK-NEXT:    [[B_INS_0:%.*]] = insertelement <9 x double> undef, double [[B_LANE_0]], i32 0
-; CHECK-NEXT:    [[B_INS_1:%.*]] = insertelement <9 x double> [[B_INS_0]], double [[B_LANE_1]], i32 1
-; CHECK-NEXT:    [[B_INS_2:%.*]] = insertelement <9 x double> [[B_INS_1]], double [[B_LANE_2]], i32 2
-; CHECK-NEXT:    [[B_INS_3:%.*]] = insertelement <9 x double> [[B_INS_2]], double [[B_LANE_3]], i32 3
-; CHECK-NEXT:    [[B_INS_4:%.*]] = insertelement <9 x double> [[B_INS_3]], double [[B_LANE_4]], i32 4
-; CHECK-NEXT:    [[B_INS_5:%.*]] = insertelement <9 x double> [[B_INS_4]], double [[B_LANE_5]], i32 5
-; CHECK-NEXT:    [[B_INS_6:%.*]] = insertelement <9 x double> [[B_INS_5]], double [[B_LANE_6]], i32 6
-; CHECK-NEXT:    [[B_INS_7:%.*]] = insertelement <9 x double> [[B_INS_6]], double [[B_LANE_7]], i32 7
-; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_7]], double [[B_LANE_8]], i32 8
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <8 x double> [[TMP34]], <8 x double> undef, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
+; CHECK-NEXT:    [[B_INS_71:%.*]] = shufflevector <9 x double> undef, <9 x double> [[TMP35]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
+; CHECK-NEXT:    [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
 ; CHECK-NEXT:    [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
 ; CHECK-NEXT:    store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index b9dabe966bcd..3db7c3940937 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -248,11 +248,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[TMP3]], i64 0
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
-; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    [[INS_11:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_11]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
 bb:
@@ -300,8 +297,8 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
 ; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
+; GFX8-NEXT:    [[INS_32:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_32]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index c7985b90f012..4ad7654b6009 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -248,11 +248,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> <i32 0, i32 1>
 ; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0
-; GFX8-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1
-; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
+; GFX8-NEXT:    [[INS_11:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 undef>
+; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_11]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
 bb:
@@ -300,8 +297,8 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> undef, <2 x i32> <i32 2, i32 3>
 ; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; GFX8-NEXT:    ret <4 x i16> [[INS_3]]
+; GFX8-NEXT:    [[INS_32:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    ret <4 x i16> [[INS_32]]
 ;
 bb:
   %arg0.0 = extractelement <4 x i16> %arg0, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
index 6f8ab9248ca3..670bd443493b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap-inseltpoison.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
@@ -16,11 +17,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %arg) {
 ; GFX8-LABEL: @bswap_v2i16(
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[ARG:%.*]])
-; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0
-; GFX8-NEXT:    [[T2:%.*]] = insertelement <2 x i16> poison, i16 [[TMP1]], i64 0
-; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1
-; GFX8-NEXT:    [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[TMP2]], i64 1
-; GFX8-NEXT:    ret <2 x i16> [[T5]]
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
 ;
 bb:
   %t = extractelement <2 x i16> %arg, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll
index 5541e0295637..fc906156ca40 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/bswap.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
@@ -16,11 +17,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %arg) {
 ; GFX8-LABEL: @bswap_v2i16(
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[ARG:%.*]])
-; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0
-; GFX8-NEXT:    [[T2:%.*]] = insertelement <2 x i16> undef, i16 [[TMP1]], i64 0
-; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1
-; GFX8-NEXT:    [[T5:%.*]] = insertelement <2 x i16> [[T2]], i16 [[TMP2]], i64 1
-; GFX8-NEXT:    ret <2 x i16> [[T5]]
+; GFX8-NEXT:    ret <2 x i16> [[TMP0]]
 ;
 bb:
   %t = extractelement <2 x i16> %arg, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
index 3a2553066f9b..cdafdcb38cc7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -13,11 +13,7 @@ define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1)
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[ARG1_1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i16> [[TMP2]], i16 [[ARG1_2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP1]], <2 x i16> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; CHECK-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT:    [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[TMP6]], i64 1
-; CHECK-NEXT:    ret <2 x i16> [[INS_2]]
+; CHECK-NEXT:    ret <2 x i16> [[TMP4]]
 ;
 bb:
   %arg0.1 = extractelement <9 x i16> undef, i64 7

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
index 51ff7a568b77..e512948284d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round-inseltpoison.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
@@ -16,11 +17,7 @@ define <2 x half> @round_v2f16(<2 x half> %arg) {
 ; GFX8-LABEL: @round_v2f16(
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x half> @llvm.round.v2f16(<2 x half> [[ARG:%.*]])
-; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x half> [[TMP0]], i32 0
-; GFX8-NEXT:    [[T2:%.*]] = insertelement <2 x half> poison, half [[TMP1]], i64 0
-; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x half> [[TMP0]], i32 1
-; GFX8-NEXT:    [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[TMP2]], i64 1
-; GFX8-NEXT:    ret <2 x half> [[T5]]
+; GFX8-NEXT:    ret <2 x half> [[TMP0]]
 ;
 bb:
   %t = extractelement <2 x half> %arg, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
index 045abffa000f..b79591f285c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
@@ -16,11 +17,7 @@ define <2 x half> @round_v2f16(<2 x half> %arg) {
 ; GFX8-LABEL: @round_v2f16(
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[TMP0:%.*]] = call <2 x half> @llvm.round.v2f16(<2 x half> [[ARG:%.*]])
-; GFX8-NEXT:    [[TMP1:%.*]] = extractelement <2 x half> [[TMP0]], i32 0
-; GFX8-NEXT:    [[T2:%.*]] = insertelement <2 x half> undef, half [[TMP1]], i64 0
-; GFX8-NEXT:    [[TMP2:%.*]] = extractelement <2 x half> [[TMP0]], i32 1
-; GFX8-NEXT:    [[T5:%.*]] = insertelement <2 x half> [[T2]], half [[TMP2]], i64 1
-; GFX8-NEXT:    ret <2 x half> [[T5]]
+; GFX8-NEXT:    ret <2 x half> [[TMP0]]
 ;
 bb:
   %t = extractelement <2 x half> %arg, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
index 9d3102566258..4918930505f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert-inseltpoison.ll
@@ -4,15 +4,7 @@
 define <4 x i32> @PR13837(<4 x float> %in) {
 ; CHECK-LABEL: @PR13837(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[V3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %t0 = extractelement <4 x float> %in, i64 0
   %t1 = extractelement <4 x float> %in, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
index 5998801705bb..0aa17969766d 100644
--- a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
@@ -4,15 +4,7 @@
 define <4 x i32> @PR13837(<4 x float> %in) {
 ; CHECK-LABEL: @PR13837(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[V3]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %t0 = extractelement <4 x float> %in, i64 0
   %t1 = extractelement <4 x float> %in, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
index a7c0660195cf..d1ed6bddf709 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865-inseltpoison.ll
@@ -10,10 +10,8 @@ define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
index 39a4123ff816..570157ae20bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
@@ -10,10 +10,8 @@ define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> [[TMP2]], half [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x half> [[TMP3]] to <2 x float>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP7]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VECINS_I_5_I1:%.*]] = shufflevector <8 x i32> undef, <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 2da4a144559b..6109767563f0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -7,7 +7,7 @@ define void @Test(i32) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
@@ -45,11 +45,9 @@ define void @Test(i32) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
index 6654dbc47bd5..6233845e2a6e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -8,49 +8,30 @@
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[R72]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
 ; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x float> [[R7]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
 ; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[R7]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; AVX512-LABEL: @sitofp_uitofp(
 ; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -81,91 +62,54 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
 ; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
 ; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
 ; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SLM-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SLM-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SLM-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
 ; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
-; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[R7]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX512-LABEL: @fptosi_fptoui(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -250,8 +194,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; CHECK-LABEL: @sext_zext(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -281,29 +225,43 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 }
 
 define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK-LABEL: @sitofp_4i32_8i16(
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; CHECK-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
-; CHECK-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
-; CHECK-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; SSE-LABEL: @sitofp_4i32_8i16(
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SSE-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
+; SSE-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
+; SSE-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_4i32_8i16(
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX-LABEL: @sitofp_4i32_8i16(
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX512-LABEL: @sitofp_4i32_8i16(
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    ret <8 x float> [[R72]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -334,109 +292,24 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 
 ; Inspired by PR38154
 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
-; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SLM-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; SLM-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; SLM-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; SLM-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; AVX-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; AVX-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; AVX-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; AVX-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
-; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; AVX-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; CHECK-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; CHECK-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; CHECK-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index d2abbeb1a6fa..1c19a9a89467 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -8,49 +8,30 @@
 
 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE-LABEL: @sitofp_uitofp(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SSE-NEXT:    [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x float> [[R72]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
 ; SLM-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; SLM-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SLM-NEXT:    ret <8 x float> [[R7]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
 ; AVX-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; AVX-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX-NEXT:    ret <8 x float> [[R7]]
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    ret <8 x float> [[TMP3]]
 ;
 ; AVX512-LABEL: @sitofp_uitofp(
 ; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
 ; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -81,91 +62,54 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 
 define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE-LABEL: @fptosi_fptoui(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
 ; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SSE-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SSE-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SSE-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SSE-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SSE-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SSE-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SSE-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
 ; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A:%.*]], i32 4
 ; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
 ; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
 ; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; SLM-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; SLM-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; SLM-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
 ; SLM-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
 ; SLM-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
 ; SLM-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R31:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R31]], i32 [[AB4]], i32 4
 ; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; AVX-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; AVX-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; AVX-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; AVX-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; AVX-NEXT:    [[AB0:%.*]] = fptosi float [[A0]] to i32
-; AVX-NEXT:    [[AB1:%.*]] = fptosi float [[A1]] to i32
-; AVX-NEXT:    [[AB2:%.*]] = fptosi float [[A2]] to i32
-; AVX-NEXT:    [[AB3:%.*]] = fptosi float [[A3]] to i32
-; AVX-NEXT:    [[AB4:%.*]] = fptoui float [[A4]] to i32
-; AVX-NEXT:    [[AB5:%.*]] = fptoui float [[A5]] to i32
-; AVX-NEXT:    [[AB6:%.*]] = fptoui float [[A6]] to i32
-; AVX-NEXT:    [[AB7:%.*]] = fptoui float [[A7]] to i32
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[R7]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
+; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX512-LABEL: @fptosi_fptoui(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
 ; AVX512-NEXT:    [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -250,8 +194,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; CHECK-LABEL: @sext_zext(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -281,29 +225,43 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 }
 
 define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK-LABEL: @sitofp_4i32_8i16(
-; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
-; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; CHECK-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
-; CHECK-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
-; CHECK-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; SSE-LABEL: @sitofp_4i32_8i16(
+; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
+; SSE-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SSE-NEXT:    [[AB5:%.*]] = sitofp i16 [[B1]] to float
+; SSE-NEXT:    [[AB6:%.*]] = sitofp i16 [[B2]] to float
+; SSE-NEXT:    [[AB7:%.*]] = sitofp i16 [[B3]] to float
+; SSE-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT:    ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_4i32_8i16(
+; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX-LABEL: @sitofp_4i32_8i16(
+; AVX-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; AVX-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x float> [[R72]]
+;
+; AVX512-LABEL: @sitofp_4i32_8i16(
+; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    ret <8 x float> [[R72]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -334,109 +292,24 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 
 ; Inspired by PR38154
 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
-; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SSE-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; SSE-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SSE-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SSE-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SSE-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; SSE-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
-; SSE-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SSE-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; SSE-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; SSE-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; SLM-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; SLM-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SLM-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; SLM-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; SLM-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; SLM-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; SLM-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; SLM-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
-; SLM-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
-; SLM-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; AVX-NEXT:    [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; AVX-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; AVX-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; AVX-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; AVX-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; AVX-NEXT:    [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; AVX-NEXT:    [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; AVX-NEXT:    [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; AVX-NEXT:    [[AB3:%.*]] = uitofp i32 [[A3]] to float
-; AVX-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; AVX-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; AVX-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; AVX-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; AVX-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; AVX512-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
-; AVX512-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
-; AVX512-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
-; AVX512-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; AVX512-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
-; AVX512-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
-; AVX512-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
-; AVX512-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
-; AVX512-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; CHECK-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; CHECK-NEXT:    [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; CHECK-NEXT:    [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; CHECK-NEXT:    [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; CHECK-NEXT:    [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; CHECK-NEXT:    [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; CHECK-NEXT:    [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; CHECK-NEXT:    [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; CHECK-NEXT:    [[R31:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R31]], float [[AB4]], i32 4
+; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT:    ret <8 x float> [[R7]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
index 26058f126c1f..d960aaa65298 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -10,8 +10,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @fadd_fsub_v8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -49,58 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
-; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
-; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
-; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
-; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], [[B0]]
-; SLM-NEXT:    [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
-; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], [[B3]]
-; SLM-NEXT:    [[AB4:%.*]] = fmul float [[A4]], [[B4]]
-; SLM-NEXT:    [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
-; SLM-NEXT:    [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
-; SLM-NEXT:    [[AB7:%.*]] = fmul float [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -148,11 +101,8 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
 ; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R11:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R11]], float [[A2]], i32 2
 ; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index 51a9cf19d49f..4c7e96544f44 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -10,8 +10,8 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @fadd_fsub_v8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -49,58 +49,11 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; SSE-NEXT:    ret <8 x float> [[R7]]
-;
-; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
-; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
-; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
-; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
-; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
-; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
-; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
-; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], [[B0]]
-; SLM-NEXT:    [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
-; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], [[B3]]
-; SLM-NEXT:    [[AB4:%.*]] = fmul float [[A4]], [[B4]]
-; SLM-NEXT:    [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
-; SLM-NEXT:    [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
-; SLM-NEXT:    [[AB7:%.*]] = fmul float [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
-; SLM-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX-NEXT:    ret <8 x float> [[R7]]
-;
-; AVX512-LABEL: @fmul_fdiv_v8f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
-; AVX512-NEXT:    ret <8 x float> [[R7]]
+; CHECK-LABEL: @fmul_fdiv_v8f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -148,11 +101,8 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
 ; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
-; SLM-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; SLM-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R11:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R11]], float [[A2]], i32 2
 ; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index c939789bffce..84c13464aa17 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
@@ -10,8 +10,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: @add_sub_v8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -52,8 +52,8 @@ define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_and_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x i32> [[R3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -78,8 +78,8 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_mul_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    ret <4 x i32> [[R3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -104,41 +104,34 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    ret <8 x i32> [[R72]]
+;
+; SLM-LABEL: @ashr_shl_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
-; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX512-LABEL: @ashr_shl_v8i32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -181,28 +174,34 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R72]]
+;
+; SLM-LABEL: @ashr_shl_v8i32_const(
+; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
 ; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32_const(
 ; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX512-LABEL: @ashr_shl_v8i32_const(
 ; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -235,114 +234,105 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
 ; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
 ; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
 ; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
+; SLM-LABEL: @ashr_lshr_shl_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[R72]]
+;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
-; AVX1-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX1-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX1-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; AVX1-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
-; AVX2-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
-; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
-; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
-; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX2-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
-; AVX512-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
-; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
-; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    ret <8 x i32> [[R72]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -466,8 +456,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP5]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 0d6b791d79db..9a3973b4bba5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
@@ -10,8 +10,8 @@ define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: @add_sub_v8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -52,8 +52,8 @@ define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_and_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x i32> [[R3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -78,8 +78,8 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @add_mul_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    ret <4 x i32> [[R3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -104,41 +104,34 @@ define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    ret <8 x i32> [[R72]]
+;
+; SLM-LABEL: @ashr_shl_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
-; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
-; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
-; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; AVX1-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
-; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 11, i32 undef, i32 undef>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX1-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32(
 ; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX512-LABEL: @ashr_shl_v8i32(
 ; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
 ; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -181,28 +174,34 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; SSE-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    ret <8 x i32> [[R7]]
+; SSE-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[R72]]
+;
+; SLM-LABEL: @ashr_shl_v8i32_const(
+; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
 ; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX1-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
 ; AVX1-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
-; AVX1-NEXT:    [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX2-LABEL: @ashr_shl_v8i32_const(
 ; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX512-LABEL: @ashr_shl_v8i32_const(
 ; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
-; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -235,114 +234,105 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
 ; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
 ; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
 ; SSE-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; SSE-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
 ; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; SSE-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; SSE-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
+; SLM-LABEL: @ashr_lshr_shl_v8i32(
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; SLM-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x i32> [[R72]]
+;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
 ; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
 ; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
 ; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
-; AVX1-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
-; AVX1-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
 ; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
 ; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
-; AVX1-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX1-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX1-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX1-NEXT:    [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; AVX1-NEXT:    [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP3:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]]
+; AVX1-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
 ; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
-; AVX1-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
-; AVX1-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
-; AVX1-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
-; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
-; AVX1-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX1-NEXT:    ret <8 x i32> [[R7]]
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
-; AVX2-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
-; AVX2-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; AVX2-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
-; AVX2-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX2-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX2-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX2-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX2-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX2-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
-; AVX2-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX2-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX2-NEXT:    ret <8 x i32> [[R7]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX2-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    ret <8 x i32> [[R72]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
-; AVX512-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
-; AVX512-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; AVX512-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
-; AVX512-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; AVX512-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
-; AVX512-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
-; AVX512-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
-; AVX512-NEXT:    ret <8 x i32> [[R7]]
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
+; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
+; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; AVX512-NEXT:    [[R72:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    ret <8 x i32> [[R72]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -466,8 +456,8 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP5]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
index 68a8c8684b68..2051bca86f08 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
@@ -14,11 +14,7 @@
 define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -34,11 +30,7 @@ define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -54,11 +46,7 @@ define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -74,11 +62,7 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: @buildvector_div_2f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    ret <2 x double> [[R1]]
+; SSE-NEXT:    ret <2 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_2f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
@@ -93,19 +77,11 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: @buildvector_div_2f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    ret <2 x double> [[R1]]
+; AVX-NEXT:    ret <2 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    ret <2 x double> [[R1]]
+; AVX512-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -121,15 +97,7 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -153,15 +121,7 @@ define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -185,15 +145,7 @@ define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -217,15 +169,7 @@ define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -253,15 +197,7 @@ define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -285,15 +221,7 @@ define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -317,15 +245,7 @@ define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -349,15 +269,7 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @buildvector_div_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; SSE-NEXT:    ret <4 x double> [[R3]]
+; SSE-NEXT:    ret <4 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_4f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
@@ -380,27 +292,11 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ;
 ; AVX-LABEL: @buildvector_div_4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; AVX-NEXT:    ret <4 x double> [[R3]]
+; AVX-NEXT:    ret <4 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; AVX512-NEXT:    ret <4 x double> [[R3]]
+; AVX512-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -424,23 +320,7 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -480,23 +360,7 @@ define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -536,23 +400,7 @@ define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -592,23 +440,7 @@ define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -652,23 +484,7 @@ define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -708,23 +524,7 @@ define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -764,23 +564,7 @@ define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -820,23 +604,7 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SSE-LABEL: @buildvector_div_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; SSE-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; SSE-NEXT:    ret <8 x double> [[R7]]
+; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
@@ -855,63 +623,43 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
 ; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
 ; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
-; SLM-NEXT:    [[C0:%.*]] = fdiv double [[A0]], [[B0]]
-; SLM-NEXT:    [[C1:%.*]] = fdiv double [[A1]], [[B1]]
-; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
-; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
-; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
-; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
-; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
-; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[C0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
-; SLM-NEXT:    ret <8 x double> [[R7]]
+; SLM-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
+; SLM-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; SLM-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1
+; SLM-NEXT:    [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1
+; SLM-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0
+; SLM-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0
+; SLM-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1
+; SLM-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0
+; SLM-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1
+; SLM-NEXT:    [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]]
+; SLM-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[A6]], i32 0
+; SLM-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1
+; SLM-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0
+; SLM-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1
+; SLM-NEXT:    [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]]
+; SLM-NEXT:    [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R11:%.*]] = shufflevector <8 x double> poison, <8 x double> [[TMP21]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R32:%.*]] = shufflevector <8 x double> [[R11]], <8 x double> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R53:%.*]] = shufflevector <8 x double> [[R32]], <8 x double> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SLM-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R74:%.*]] = shufflevector <8 x double> [[R53]], <8 x double> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    ret <8 x double> [[R74]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; AVX-NEXT:    ret <8 x double> [[R7]]
+; AVX-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; AVX512-NEXT:    ret <8 x double> [[R7]]
+; AVX512-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -951,39 +699,7 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1055,39 +771,7 @@ define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1159,39 +843,7 @@ define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1263,39 +915,7 @@ define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
index 7ed78eeecf28..88c282cea280 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -14,11 +14,7 @@
 define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -34,11 +30,7 @@ define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -54,11 +46,7 @@ define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_2f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    ret <2 x double> [[R1]]
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -74,11 +62,7 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: @buildvector_div_2f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    ret <2 x double> [[R1]]
+; SSE-NEXT:    ret <2 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_2f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
@@ -93,19 +77,11 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: @buildvector_div_2f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    ret <2 x double> [[R1]]
+; AVX-NEXT:    ret <2 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_2f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    ret <2 x double> [[R1]]
+; AVX512-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -121,15 +97,7 @@ define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
 define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -153,15 +121,7 @@ define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -185,15 +145,7 @@ define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -217,15 +169,7 @@ define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[R3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
@@ -253,15 +197,7 @@ define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
 define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -285,15 +221,7 @@ define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -317,15 +245,7 @@ define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_4f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[R3]]
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -349,15 +269,7 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @buildvector_div_4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; SSE-NEXT:    ret <4 x double> [[R3]]
+; SSE-NEXT:    ret <4 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_4f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
@@ -380,27 +292,11 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 ;
 ; AVX-LABEL: @buildvector_div_4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; AVX-NEXT:    ret <4 x double> [[R3]]
+; AVX-NEXT:    ret <4 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_4f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
-; AVX512-NEXT:    ret <4 x double> [[R3]]
+; AVX512-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -424,23 +320,7 @@ define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
 define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -480,23 +360,7 @@ define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -536,23 +400,7 @@ define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -592,23 +440,7 @@ define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_8f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[R7]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -652,23 +484,7 @@ define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
 define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_add_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -708,23 +524,7 @@ define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_sub_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -764,23 +564,7 @@ define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: @buildvector_mul_8f64(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x double> [[R7]]
+; CHECK-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -820,23 +604,7 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SSE-LABEL: @buildvector_div_8f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; SSE-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; SSE-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; SSE-NEXT:    ret <8 x double> [[R7]]
+; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
@@ -855,63 +623,43 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
 ; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
 ; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
-; SLM-NEXT:    [[C0:%.*]] = fdiv double [[A0]], [[B0]]
-; SLM-NEXT:    [[C1:%.*]] = fdiv double [[A1]], [[B1]]
-; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
-; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
-; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
-; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
-; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
-; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
-; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
-; SLM-NEXT:    ret <8 x double> [[R7]]
+; SLM-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A1]], i32 1
+; SLM-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; SLM-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B1]], i32 1
+; SLM-NEXT:    [[TMP5:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP4]]
+; SLM-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A3]], i32 1
+; SLM-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> poison, double [[B2]], i32 0
+; SLM-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B3]], i32 1
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[A4]], i32 0
+; SLM-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[A5]], i32 1
+; SLM-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[B4]], i32 0
+; SLM-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[B5]], i32 1
+; SLM-NEXT:    [[TMP15:%.*]] = fdiv <2 x double> [[TMP12]], [[TMP14]]
+; SLM-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[A6]], i32 0
+; SLM-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[A7]], i32 1
+; SLM-NEXT:    [[TMP18:%.*]] = insertelement <2 x double> poison, double [[B6]], i32 0
+; SLM-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> [[TMP18]], double [[B7]], i32 1
+; SLM-NEXT:    [[TMP20:%.*]] = fdiv <2 x double> [[TMP17]], [[TMP19]]
+; SLM-NEXT:    [[TMP21:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R11:%.*]] = shufflevector <8 x double> undef, <8 x double> [[TMP21]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R32:%.*]] = shufflevector <8 x double> [[R11]], <8 x double> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP23:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R53:%.*]] = shufflevector <8 x double> [[R32]], <8 x double> [[TMP23]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SLM-NEXT:    [[TMP24:%.*]] = shufflevector <2 x double> [[TMP20]], <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT:    [[R74:%.*]] = shufflevector <8 x double> [[R53]], <8 x double> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    ret <8 x double> [[R74]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; AVX-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; AVX-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; AVX-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; AVX-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; AVX-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; AVX-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; AVX-NEXT:    ret <8 x double> [[R7]]
+; AVX-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; AVX512-LABEL: @buildvector_div_8f64(
 ; AVX512-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
-; AVX512-NEXT:    [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
-; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
-; AVX512-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
-; AVX512-NEXT:    [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
-; AVX512-NEXT:    ret <8 x double> [[R7]]
+; AVX512-NEXT:    ret <8 x double> [[TMP1]]
 ;
   %a0 = extractelement <8 x double> %a, i32 0
   %a1 = extractelement <8 x double> %a, i32 1
@@ -951,39 +699,7 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_add_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1055,39 +771,7 @@ define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_sub_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1159,39 +843,7 @@ define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_mul_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1
@@ -1263,39 +915,7 @@ define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
 define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: @buildvector_div_16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
-; CHECK-NEXT:    [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
-; CHECK-NEXT:    [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
-; CHECK-NEXT:    [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
-; CHECK-NEXT:    [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
-; CHECK-NEXT:    [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
-; CHECK-NEXT:    [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
-; CHECK-NEXT:    [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
-; CHECK-NEXT:    [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
-; CHECK-NEXT:    ret <16 x float> [[R15]]
+; CHECK-NEXT:    ret <16 x float> [[TMP1]]
 ;
   %a0  = extractelement <16 x float> %a, i32 0
   %a1  = extractelement <16 x float> %a, i32 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
index 68c5bd850590..98fe1dfc1a76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -22,19 +22,9 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 
 define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x i8> poison, i8 [[X0X0]], i32 0
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
-; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -53,16 +43,9 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
 
 define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h_undef(
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> <i8 undef, i8 poison, i8 poison, i8 poison>, i8 [[X3X3]], i32 1
-; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %x0 = extractelement <4 x i8> undef, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 776f32dee743..210ae5d39a50 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -22,19 +22,9 @@ define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 
 define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
-; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %x0 = extractelement <4 x i8> %x, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3
@@ -53,16 +43,9 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
 
 define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h_undef(
-; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
-; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
-; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
-; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
-; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
-; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
-; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
 ;
   %x0 = extractelement <4 x i8> undef, i32 0
   %x3 = extractelement <4 x i8> %x, i32 3

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
index f1ece799f1b0..949572676a34 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load-inseltpoison.ll
@@ -13,15 +13,7 @@ define void @hoge(i64 %idx, <4 x i32>* %sink) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[SINK:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
index 4a4b7d006888..9cb6bc56b9e2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
@@ -13,15 +13,7 @@ define void @hoge(i64 %idx, <4 x i32>* %sink) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]], align 16
+; CHECK-NEXT:    store <4 x i32> [[SHUFFLE]], <4 x i32>* [[SINK:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
 bb:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
index 8bc8639a0eb6..ed071f90ad76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll
@@ -487,15 +487,12 @@ define void @fptosi_8f32_8i8() #0 {
 
 define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf64_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi <4 x double> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %cvt0 = fptosi double %a0 to i32
   %cvt1 = fptosi double %a1 to i32
@@ -510,15 +507,12 @@ define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %
 
 define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf32_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> poison, i32 [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %cvt0 = fptosi float %a0 to i32
   %cvt1 = fptosi float %a1 to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
index aeb485d8466b..8d46e40fc5a9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -487,15 +487,12 @@ define void @fptosi_8f32_8i8() #0 {
 
 define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf64_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[A3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi <4 x double> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %cvt0 = fptosi double %a0 to i32
   %cvt1 = fptosi double %a1 to i32
@@ -510,15 +507,12 @@ define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %
 
 define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
 ; CHECK-LABEL: @fptosi_4xf32_4i32(
-; CHECK-NEXT:    [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32
-; CHECK-NEXT:    [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32
-; CHECK-NEXT:    [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32
-; CHECK-NEXT:    [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[A3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %cvt0 = fptosi float %a0 to i32
   %cvt1 = fptosi float %a1 to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 97775135d025..bc6eeb1cdd7f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -11,28 +11,11 @@
 ;
 
 define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: @test_v2f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x double> [[TMP3]]
-;
-; SLM-LABEL: @test_v2f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    ret <2 x double> [[R01]]
-;
-; AVX-LABEL: @test_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-LABEL: @test_v2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -169,27 +152,18 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R032]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R032]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -230,8 +204,8 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
 ; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R07]]
+; SLM-NEXT:    [[R072:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R072]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -350,8 +324,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
 ; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[RV152:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV152]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index a3a18a2e5a56..b9d55ecb27eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -11,28 +11,11 @@
 ;
 
 define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: @test_v2f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x double> [[TMP3]]
-;
-; SLM-LABEL: @test_v2f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    ret <2 x double> [[R01]]
-;
-; AVX-LABEL: @test_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-LABEL: @test_v2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -169,27 +152,18 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R032]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
-; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R032]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -230,8 +204,8 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
 ; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R07]]
+; SLM-NEXT:    [[R072:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R072]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -350,8 +324,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
 ; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[RV152:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV152]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
index 51711dc352fa..3d165eb00b90 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -11,28 +11,11 @@
 ;
 
 define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: @test_v2f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x double> [[TMP3]]
-;
-; SLM-LABEL: @test_v2f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> poison, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    ret <2 x double> [[R01]]
-;
-; AVX-LABEL: @test_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-LABEL: @test_v2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -169,27 +152,18 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R032]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R032]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -230,8 +204,8 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
 ; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R07]]
+; SLM-NEXT:    [[R072:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R072]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -350,8 +324,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[RV152:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV152]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 5495c73aaf10..9ff8142b269d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -11,28 +11,11 @@
 ;
 
 define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: @test_v2f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT:    ret <2 x double> [[TMP3]]
-;
-; SLM-LABEL: @test_v2f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
-; SLM-NEXT:    [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    ret <2 x double> [[R01]]
-;
-; AVX-LABEL: @test_v2f64(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-LABEL: @test_v2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
 ;
   %a0 = extractelement <2 x double> %a, i32 0
   %a1 = extractelement <2 x double> %a, i32 1
@@ -169,27 +152,18 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    ret <4 x double> [[R03]]
+; SSE-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    ret <4 x double> [[R032]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
-; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
-; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
-; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
-; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
-; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
-; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
-; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
-; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
-; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
-; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
-; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
-; SLM-NEXT:    ret <4 x double> [[R03]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT:    [[R032:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    ret <4 x double> [[R032]]
 ;
 ; AVX-LABEL: @test_v4f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -230,8 +204,8 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
 ; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    ret <8 x float> [[R07]]
+; SLM-NEXT:    [[R072:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    ret <8 x float> [[R072]]
 ;
 ; AVX-LABEL: @test_v8f32(
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -350,8 +324,8 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
-; SSE-NEXT:    [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SSE-NEXT:    ret <16 x i16> [[RV15]]
+; SSE-NEXT:    [[RV152:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <16 x i16> [[RV152]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
index e13f1b196f61..45c4503dff3e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -9,15 +9,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
 ; CHECK-LABEL: @simple_select(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -131,15 +123,7 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[SHUFFLE]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[SHUFFLE1]], <4 x float> [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -176,16 +160,8 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
 ; CHECK-LABEL: @simple_select_users(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    call void @v4f32_user(<4 x float> [[RD]]) [[ATTR0:#.*]]
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -246,15 +222,11 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> poison, float [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> poison, float [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RB2:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP17]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[RD1]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -312,11 +284,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
 ; CHECK-LABEL: @simple_select_v2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[RB]]
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
@@ -384,15 +352,7 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
 define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @reschedule_extract(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -418,15 +378,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @take_credit(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -456,15 +408,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double 3.000000e+00, double 2.000000e+00, double 1.000000e+00, double 0.000000e+00>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> poison, double [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0
-; CHECK-NEXT:    ret <4 x double> [[I4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP6]]
 ;
   %t0 = fadd double %w , 0.000000e+00
   %t1 = fadd double %x , 1.000000e+00
@@ -484,23 +428,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: @_vadd256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[VECINIT7_I]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %b, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 9f7d8e17d267..2e9f4f6cfc00 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -9,15 +9,7 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
 ; CHECK-LABEL: @simple_select(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -131,15 +123,7 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[SHUFFLE]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[SHUFFLE1]], <4 x float> [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -176,16 +160,8 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
 ; CHECK-LABEL: @simple_select_users(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
-; CHECK-NEXT:    call void @v4f32_user(<4 x float> [[RD]]) [[ATTR0:#.*]]
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    call void @v4f32_user(<4 x float> [[TMP2]]) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -246,15 +222,11 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
-; CHECK-NEXT:    [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
-; CHECK-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RD]]
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RB2:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP17]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x float> [[RD1]]
 ;
   %c0 = extractelement <4 x i32> %c, i32 0
   %c1 = extractelement <4 x i32> %c, i32 1
@@ -312,11 +284,7 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
 ; CHECK-LABEL: @simple_select_v2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[RB]]
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
@@ -384,15 +352,7 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
 define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @reschedule_extract(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -418,15 +378,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @take_credit(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %b0 = extractelement <4 x float> %b, i32 0
@@ -456,15 +408,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[W:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double 3.000000e+00, double 2.000000e+00, double 1.000000e+00, double 0.000000e+00>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP6]], i32 3
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP6]], i32 2
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP10]], i32 0
-; CHECK-NEXT:    ret <4 x double> [[I4]]
+; CHECK-NEXT:    ret <4 x double> [[TMP6]]
 ;
   %t0 = fadd double %w , 0.000000e+00
   %t1 = fadd double %x , 1.000000e+00
@@ -484,23 +428,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
 ; CHECK-LABEL: @_vadd256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
-; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
-; CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
-; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
-; CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 7
-; CHECK-NEXT:    ret <8 x float> [[VECINIT7_I]]
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %vecext = extractelement <8 x float> %a, i32 0
   %vecext1 = extractelement <8 x float> %b, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 078a965656c9..8ad80f7e7d28 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -57,11 +57,9 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I11:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP3]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I11]], float [[X2]], i32 2
 ; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
@@ -85,13 +83,7 @@ define <4 x float> @PR16739_byref_alt(<4 x float>* nocapture readonly dereferenc
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[I3]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
 ;
   %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
   %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 793ccb5b4444..39bf185a5bca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -57,11 +57,9 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[X2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I11:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP3]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I11]], float [[X2]], i32 2
 ; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
@@ -85,13 +83,7 @@ define <4 x float> @PR16739_byref_alt(<4 x float>* nocapture readonly dereferenc
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
-; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[I3]]
+; CHECK-NEXT:    ret <4 x float> [[SHUFFLE]]
 ;
   %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
   %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
index b29f64165a79..357c49df8668 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -12,23 +12,19 @@ define i32 @test(double* nocapture %A, i8* nocapture %B) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], <i8 3, i8 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8> [[TMP5]], i8 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], <double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index ced5b1d49bce..a7819f9b3ea4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -183,11 +183,11 @@ define void @vecload_vs_broadcast5(double * noalias %from, double * noalias %to,
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LP]], label [[EXT:%.*]]
 ; CHECK:       ext:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 8783c16d1168..f76388d93ae9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -150,8 +150,8 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP17:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
@@ -163,14 +163,14 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP11]] = extractelement <2 x float> [[SHUFFLE1]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13]] = extractelement <2 x float> [[SHUFFLE1]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
-; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[SHUFFLE1]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP11]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    [[TMP16]] = extractelement <2 x float> [[SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[TMP17]] = extractelement <2 x float> [[SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
index 448480d23b41..6ed423173e54 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll
@@ -6,11 +6,7 @@ define <2 x float> @foo() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SOURCE:%.*]] = insertelement <2 x float> poison, float undef, i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[RES2]]
+; CHECK-NEXT:    ret <2 x float> [[TMP0]]
 ;
 entry:
   %source = insertelement <2 x float> poison, float undef, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
index 64e0f7be7e2e..fec558e82190 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
@@ -6,11 +6,7 @@ define <2 x float> @foo() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
-; CHECK-NEXT:    ret <2 x float> [[RES2]]
+; CHECK-NEXT:    ret <2 x float> [[TMP0]]
 ;
 entry:
   %source = insertelement <2 x float> undef, float undef, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
index dc583bed1ad7..517f9065766e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
@@ -31,15 +31,12 @@ entry:
 define void @test1_vec(float %a, float %b, float %c, float %d, <4 x i32>* nocapture %p) {
 ; CHECK-LABEL: @test1_vec(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[A:%.*]] to i32
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[CONV1:%.*]] = fptosi float [[B:%.*]] to i32
-; CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV1]], i32 1
-; CHECK-NEXT:    [[CONV3:%.*]] = fptosi float [[C:%.*]] to i32
-; CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[CONV3]], i32 2
-; CHECK-NEXT:    [[CONV5:%.*]] = fptosi float [[D:%.*]] to i32
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[CONV5]], i32 3
-; CHECK-NEXT:    store <4 x i32> [[VECINIT6]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[P:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -84,15 +81,12 @@ entry:
 
 define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, <4 x i32>* nocapture %4) {
 ; CHECK-LABEL: @test2_vec(
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP0:%.*]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[TMP1:%.*]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP2:%.*]], 1
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP3:%.*]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
-; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
   %6 = add nsw i32 %0, 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
index 35a296820b3b..c7f600547b71 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067-inseltpoison.ll
@@ -9,11 +9,7 @@ define <2 x float> @foo({{float, float}}* %A) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0
-; CHECK-NEXT:    ret <2 x float> [[INS0]]
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
 entry:
   %0 = bitcast {{float, float}}* %A to <2 x float>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll
index 7b74dbb393eb..304bad09d208 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr44067.ll
@@ -9,11 +9,7 @@ define <2 x float> @foo({{float, float}}* %A) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast { { float, float } }* [[A:%.*]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[INS0:%.*]] = insertelement <2 x float> [[INS1]], float [[TMP4]], i32 0
-; CHECK-NEXT:    ret <2 x float> [[INS0]]
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
 entry:
   %0 = bitcast {{float, float}}* %A to <2 x float>*

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 65f896e57ba6..17e0c6ed2d43 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -528,77 +528,72 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 17, i64 8>
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 5, i64 20>
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> [[TMP9]], float* [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr float, <8 x float*> [[TMP18]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX-NEXT:    [[TMP20:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP19]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = fdiv <8 x float> [[TMP17]], [[TMP20]]
+; AVX-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX-NEXT:    store <8 x float> [[TMP21]], <8 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
 ; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX2-NEXT:    [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]]
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX2-NEXT:    store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
 ; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
 ; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
 ; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX512-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512-NEXT:    [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]]
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512-NEXT:    store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %3 = load float, float* %1, align 4, !tbaa !2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 4ae3dccca473..8d9cbbb3ae5a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -528,77 +528,72 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 17, i64 8>
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 5, i64 20>
+; AVX-NEXT:    [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
+; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> [[TMP9]], float* [[TMP3]], i32 1
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP18:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr float, <8 x float*> [[TMP18]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX-NEXT:    [[TMP20:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP19]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP21:%.*]] = fdiv <8 x float> [[TMP17]], [[TMP20]]
+; AVX-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX-NEXT:    store <8 x float> [[TMP21]], <8 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX2-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
 ; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
 ; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX2-NEXT:    [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]]
+; AVX2-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX2-NEXT:    store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512-LABEL: @gather_load_div(
 ; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
-; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
-; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
-; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
 ; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
 ; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
-; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
-; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
-; AVX512-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
-; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
-; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
-; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
-; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
-; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
-; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
-; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
-; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:    [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:    [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
+; AVX512-NEXT:    [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
+; AVX512-NEXT:    [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]]
+; AVX512-NEXT:    [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
+; AVX512-NEXT:    store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]]
 ; AVX512-NEXT:    ret void
 ;
   %3 = load float, float* %1, align 4, !tbaa !2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 8ef2f322baba..7081f1e99a95 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,70 +12,57 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT:    [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT:    [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
-; CHECK-NEXT:    [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
-; CHECK-NEXT:    [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
-; CHECK-NEXT:    [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
-; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
-; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
-; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <4 x i32> [[TMP4]], <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_I_I]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_1_I_I]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[SHR_2_I_I]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i32> [[TMP14]], <16 x i32> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i32> [[TMP16]], <16 x i32> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc <16 x i32> [[TMP18]] to <16 x i8>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP31:%.*]] = and <16 x i8> [[TMP19]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
-; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP31]], <16 x i8>* [[TMP32]], align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
index d1690f4b3ac9..f7fbb378f893 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext-inseltpoison.ll
@@ -13,24 +13,17 @@
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 ; SSE-LABEL: @loadext_2i8_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %i0 = load i8, i8* %p0, align 1
@@ -43,40 +36,14 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -85,15 +52,7 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -118,19 +77,10 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -139,15 +89,7 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -168,68 +110,18 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 }
 
 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; SSE2-LABEL: @loadext_8i8_to_8i16(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i16> [[V7]]
-;
-; SLM-LABEL: @loadext_8i8_to_8i16(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i16> [[V7]]
+; SSE-LABEL: @loadext_8i8_to_8i16(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -242,23 +134,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i16> [[V7]]
+; AVX-NEXT:    ret <8 x i16> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -295,68 +171,18 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 }
 
 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; SSE2-LABEL: @loadext_8i8_to_8i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
-;
-; SLM-LABEL: @loadext_8i8_to_8i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SSE-LABEL: @loadext_8i8_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -369,23 +195,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -422,124 +232,26 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 }
 
 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; SSE2-LABEL: @loadext_16i8_to_16i16(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; SSE2-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; SSE2-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; SSE2-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; SSE2-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; SSE2-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; SSE2-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; SSE2-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; SSE2-NEXT:    ret <16 x i16> [[V15]]
-;
-; SLM-LABEL: @loadext_16i8_to_16i16(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; SLM-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; SLM-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; SLM-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; SLM-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
-; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
-; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
-; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
-; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
-; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
-; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
-; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
-; SLM-NEXT:    [[X8:%.*]] = sext i8 [[I8]] to i16
-; SLM-NEXT:    [[X9:%.*]] = sext i8 [[I9]] to i16
-; SLM-NEXT:    [[X10:%.*]] = sext i8 [[I10]] to i16
-; SLM-NEXT:    [[X11:%.*]] = sext i8 [[I11]] to i16
-; SLM-NEXT:    [[X12:%.*]] = sext i8 [[I12]] to i16
-; SLM-NEXT:    [[X13:%.*]] = sext i8 [[I13]] to i16
-; SLM-NEXT:    [[X14:%.*]] = sext i8 [[I14]] to i16
-; SLM-NEXT:    [[X15:%.*]] = sext i8 [[I15]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
-; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
-; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
-; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
-; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
-; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
-; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
-; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
-; SLM-NEXT:    ret <16 x i16> [[V15]]
+; SSE-LABEL: @loadext_16i8_to_16i16(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_16i8_to_16i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -560,39 +272,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; AVX-NEXT:    ret <16 x i16> [[V15]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -667,24 +347,17 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 ; SSE-LABEL: @loadext_2i16_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %i0 = load i16, i16* %p0, align 1
@@ -697,40 +370,14 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 }
 
 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -739,15 +386,7 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -768,23 +407,31 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE-LABEL: @loadext_4i16_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:    [[V11:%.*]] = shufflevector <4 x i64> poison, <4 x i64> [[TMP4]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V11]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -793,15 +440,7 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -822,68 +461,18 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 }
 
 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; SSE2-LABEL: @loadext_8i16_to_8i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
-;
-; SLM-LABEL: @loadext_8i16_to_8i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = sext i16 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = sext i16 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = sext i16 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = sext i16 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SSE-LABEL: @loadext_8i16_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i16_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -896,23 +485,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -955,24 +528,17 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 ; SSE-LABEL: @loadext_2i32_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %i0 = load i32, i32* %p0, align 1
@@ -989,19 +555,10 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
 ; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i32_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1010,15 +567,7 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
index c80181781608..ae34d79542c0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -13,24 +13,17 @@
 define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 ; SSE-LABEL: @loadext_2i8_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %i0 = load i8, i8* %p0, align 1
@@ -43,40 +36,14 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 }
 
 define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -85,15 +52,7 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -118,19 +77,10 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -139,15 +89,7 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -168,68 +110,18 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 }
 
 define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; SSE2-LABEL: @loadext_8i8_to_8i16(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i16> [[V7]]
-;
-; SLM-LABEL: @loadext_8i8_to_8i16(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i16> [[V7]]
+; SSE-LABEL: @loadext_8i8_to_8i16(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -242,23 +134,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i16> [[V7]]
+; AVX-NEXT:    ret <8 x i16> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -295,68 +171,18 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 }
 
 define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; SSE2-LABEL: @loadext_8i8_to_8i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
-;
-; SLM-LABEL: @loadext_8i8_to_8i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SSE-LABEL: @loadext_8i8_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -369,23 +195,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -422,124 +232,26 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 }
 
 define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; SSE2-LABEL: @loadext_16i8_to_16i16(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE2-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; SSE2-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; SSE2-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; SSE2-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; SSE2-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; SSE2-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; SSE2-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; SSE2-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; SSE2-NEXT:    ret <16 x i16> [[V15]]
-;
-; SLM-LABEL: @loadext_16i8_to_16i16(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; SLM-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; SLM-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; SLM-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; SLM-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
-; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
-; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
-; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
-; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
-; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
-; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
-; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = sext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = sext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = sext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = sext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = sext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = sext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = sext i8 [[I7]] to i16
-; SLM-NEXT:    [[X8:%.*]] = sext i8 [[I8]] to i16
-; SLM-NEXT:    [[X9:%.*]] = sext i8 [[I9]] to i16
-; SLM-NEXT:    [[X10:%.*]] = sext i8 [[I10]] to i16
-; SLM-NEXT:    [[X11:%.*]] = sext i8 [[I11]] to i16
-; SLM-NEXT:    [[X12:%.*]] = sext i8 [[I12]] to i16
-; SLM-NEXT:    [[X13:%.*]] = sext i8 [[I13]] to i16
-; SLM-NEXT:    [[X14:%.*]] = sext i8 [[I14]] to i16
-; SLM-NEXT:    [[X15:%.*]] = sext i8 [[I15]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
-; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
-; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
-; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
-; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
-; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
-; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
-; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
-; SLM-NEXT:    ret <16 x i16> [[V15]]
+; SSE-LABEL: @loadext_16i8_to_16i16(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT:    [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE-NEXT:    [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE-NEXT:    [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE-NEXT:    [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE-NEXT:    [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_16i8_to_16i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -560,39 +272,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; AVX-NEXT:    ret <16 x i16> [[V15]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -667,24 +347,17 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 ; SSE-LABEL: @loadext_2i16_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %i0 = load i16, i16* %p0, align 1
@@ -697,40 +370,14 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 }
 
 define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -739,15 +386,7 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -768,23 +407,31 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 }
 
 define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE-LABEL: @loadext_4i16_to_4i64(
-; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SSE2-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SSE2-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE2-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE2-NEXT:    [[V11:%.*]] = shufflevector <4 x i64> undef, <4 x i64> [[TMP4]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V11]], i64 [[X2]], i32 2
+; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT:    ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -793,15 +440,7 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -822,68 +461,18 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 }
 
 define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; SSE2-LABEL: @loadext_8i16_to_8i32(
-; SSE2-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; SSE2-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; SSE2-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; SSE2-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; SSE2-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
-;
-; SLM-LABEL: @loadext_8i16_to_8i32(
-; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = sext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = sext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = sext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = sext i16 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = sext i16 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = sext i16 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = sext i16 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = sext i16 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SSE-LABEL: @loadext_8i16_to_8i32(
+; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT:    [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i16_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -896,23 +485,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -955,24 +528,17 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 ; SSE-LABEL: @loadext_2i32_to_2i64(
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    ret <2 x i64> [[V1]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %i0 = load i32, i32* %p0, align 1
@@ -989,19 +555,10 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; SSE-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
 ; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE-NEXT:    [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE-NEXT:    [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE-NEXT:    [[X2:%.*]] = sext i32 [[I2]] to i64
-; SSE-NEXT:    [[X3:%.*]] = sext i32 [[I3]] to i64
-; SSE-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE-NEXT:    ret <4 x i64> [[V3]]
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SSE-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; SSE-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i32_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1010,15 +567,7 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll
index 296486b49e14..7916027aa841 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend-inseltpoison.ll
@@ -5,15 +5,7 @@ define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
 ; CHECK-LABEL: @sign_extend_v_v(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[VECINIT9]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x i16> %lhs, i32 0
@@ -35,15 +27,7 @@ define <4 x i16> @truncate_v_v(<4 x i32> %lhs) {
 ; CHECK-LABEL: @truncate_v_v(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x i16> [[VECINIT9]]
+; CHECK-NEXT:    ret <4 x i16> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x i32> %lhs, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
index 360ce6ab0769..f76ddbc4f92d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -5,15 +5,7 @@ define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
 ; CHECK-LABEL: @sign_extend_v_v(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[VECINIT9]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x i16> %lhs, i32 0
@@ -35,15 +27,7 @@ define <4 x i16> @truncate_v_v(<4 x i32> %lhs) {
 ; CHECK-LABEL: @truncate_v_v(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3
-; CHECK-NEXT:    ret <4 x i16> [[VECINIT9]]
+; CHECK-NEXT:    ret <4 x i16> [[TMP0]]
 ;
 entry:
   %vecext = extractelement <4 x i32> %lhs, i32 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
index 7718bcd727a3..7b4677cc7e19 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll
@@ -1283,16 +1283,24 @@ define void @sitofp_16i8_16f32() #0 {
 ;
 
 define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
-; CHECK-LABEL: @sitofp_4xi32_4f64(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[RES3]]
+; SSE-LABEL: @sitofp_4xi32_4f64(
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
+; SSE-NEXT:    [[RES0:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; SSE-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
+; SSE-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
+; SSE-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
+; SSE-NEXT:    ret <4 x double> [[RES3]]
+;
+; AVX-LABEL: @sitofp_4xi32_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x double>
+; AVX-NEXT:    ret <4 x double> [[TMP5]]
 ;
   %cvt0 = sitofp i32 %a0 to double
   %cvt1 = sitofp i32 %a1 to double
@@ -1306,16 +1314,24 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 }
 
 define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
-; CHECK-LABEL: @sitofp_4xi32_4f32(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RES3]]
+; SSE-LABEL: @sitofp_4xi32_4f32(
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
+; SSE-NEXT:    [[RES0:%.*]] = insertelement <4 x float> poison, float [[CVT0]], i32 0
+; SSE-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
+; SSE-NEXT:    ret <4 x float> [[RES3]]
+;
+; AVX-LABEL: @sitofp_4xi32_4f32(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; AVX-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %cvt0 = sitofp i32 %a0 to float
   %cvt1 = sitofp i32 %a1 to float

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
index 9d3c66d3d7b9..f187f6385f57 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -1283,16 +1283,24 @@ define void @sitofp_16i8_16f32() #0 {
 ;
 
 define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
-; CHECK-LABEL: @sitofp_4xi32_4f64(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x double> [[RES3]]
+; SSE-LABEL: @sitofp_4xi32_4f64(
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
+; SSE-NEXT:    [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
+; SSE-NEXT:    [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
+; SSE-NEXT:    [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
+; SSE-NEXT:    [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
+; SSE-NEXT:    ret <4 x double> [[RES3]]
+;
+; AVX-LABEL: @sitofp_4xi32_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x double>
+; AVX-NEXT:    ret <4 x double> [[TMP5]]
 ;
   %cvt0 = sitofp i32 %a0 to double
   %cvt1 = sitofp i32 %a1 to double
@@ -1306,16 +1314,24 @@ define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
 }
 
 define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
-; CHECK-LABEL: @sitofp_4xi32_4f32(
-; CHECK-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
-; CHECK-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
-; CHECK-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
-; CHECK-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
-; CHECK-NEXT:    [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
-; CHECK-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
-; CHECK-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
-; CHECK-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[RES3]]
+; SSE-LABEL: @sitofp_4xi32_4f32(
+; SSE-NEXT:    [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
+; SSE-NEXT:    [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
+; SSE-NEXT:    [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
+; SSE-NEXT:    [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
+; SSE-NEXT:    [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; SSE-NEXT:    [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
+; SSE-NEXT:    [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
+; SSE-NEXT:    [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
+; SSE-NEXT:    ret <4 x float> [[RES3]]
+;
+; AVX-LABEL: @sitofp_4xi32_4f32(
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A0:%.*]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A2:%.*]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[A3:%.*]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; AVX-NEXT:    ret <4 x float> [[TMP5]]
 ;
   %cvt0 = sitofp i32 %a0 to float
   %cvt1 = sitofp i32 %a1 to float

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
index 7a63fbc0ae33..fc41ec9e9386 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug-inseltpoison.ll
@@ -11,32 +11,30 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb279:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
 ; CHECK-NEXT:    br label [[BB283:%.*]]
 ; CHECK:       bb283:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ undef, [[EXIT]] ]
 ; CHECK-NEXT:    br label [[BB284:%.*]]
 ; CHECK:       bb284:
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
 ; CHECK-NEXT:    br label [[BB21_I:%.*]]
 ; CHECK:       bb21.i:
 ; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
 ; CHECK:       bb22.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
 ; CHECK-NEXT:    br label [[BB32_I:%.*]]
 ; CHECK:       bb32.i:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
-; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], undef
+; CHECK-NEXT:    [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
 bb279:
@@ -93,9 +91,7 @@ exit:
 define <4 x double> @constant_folding() {
 ; CHECK-LABEL: @constant_folding(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> poison, double 1.000000e+00, i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
-; CHECK-NEXT:    ret <4 x double> [[I2]]
+; CHECK-NEXT:    ret <4 x double> <double 2.000000e+00, double 1.000000e+00, double poison, double poison>
 ;
 entry:
   %t0 = fadd double 1.000000e+00 , 0.000000e+00

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
index f5acc5a37975..cdde0971b6df 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -11,32 +11,30 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  bb279:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float undef, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
 ; CHECK-NEXT:    br label [[BB283:%.*]]
 ; CHECK:       bb283:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ undef, [[EXIT]] ]
 ; CHECK-NEXT:    br label [[BB284:%.*]]
 ; CHECK:       bb284:
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
 ; CHECK-NEXT:    br label [[BB21_I:%.*]]
 ; CHECK:       bb21.i:
 ; CHECK-NEXT:    br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
 ; CHECK:       bb22.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
 ; CHECK-NEXT:    br label [[BB32_I:%.*]]
 ; CHECK:       bb32.i:
-; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
-; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[TMP9]], undef
+; CHECK-NEXT:    [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
 bb279:
@@ -93,9 +91,7 @@ exit:
 define <4 x double> @constant_folding() {
 ; CHECK-LABEL: @constant_folding(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
-; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
-; CHECK-NEXT:    ret <4 x double> [[I2]]
+; CHECK-NEXT:    ret <4 x double> <double 2.000000e+00, double 1.000000e+00, double undef, double undef>
 ;
 entry:
   %t0 = fadd double 1.000000e+00 , 0.000000e+00

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll
index c16a988565ba..31f1c7c087d7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext-inseltpoison.ll
@@ -16,32 +16,21 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i8_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %i0 = load i8, i8* %p0, align 1
@@ -61,33 +50,16 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i8_to_4i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -96,15 +68,7 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -132,33 +96,16 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i8_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -167,15 +114,7 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -207,23 +146,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i16> [[V7]]
+; SSE2-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i8_to_8i16(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -233,31 +156,10 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i16> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; SLM-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -270,23 +172,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i16> [[V7]]
+; AVX-NEXT:    ret <8 x i16> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -334,23 +220,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i8_to_8i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -360,31 +230,10 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -397,23 +246,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -469,39 +302,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; SSE2-NEXT:    ret <16 x i16> [[V15]]
+; SSE2-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_16i8_to_16i16(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -519,55 +320,10 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
 ; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
 ; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
-; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
-; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
-; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
-; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
-; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
-; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
-; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
-; SLM-NEXT:    [[X8:%.*]] = zext i8 [[I8]] to i16
-; SLM-NEXT:    [[X9:%.*]] = zext i8 [[I9]] to i16
-; SLM-NEXT:    [[X10:%.*]] = zext i8 [[I10]] to i16
-; SLM-NEXT:    [[X11:%.*]] = zext i8 [[I11]] to i16
-; SLM-NEXT:    [[X12:%.*]] = zext i8 [[I12]] to i16
-; SLM-NEXT:    [[X13:%.*]] = zext i8 [[I13]] to i16
-; SLM-NEXT:    [[X14:%.*]] = zext i8 [[I14]] to i16
-; SLM-NEXT:    [[X15:%.*]] = zext i8 [[I15]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
-; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
-; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
-; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
-; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
-; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
-; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
-; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
-; SLM-NEXT:    ret <16 x i16> [[V15]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; SLM-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_16i8_to_16i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -588,39 +344,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> poison, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; AVX-NEXT:    ret <16 x i16> [[V15]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -698,32 +422,21 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i16_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %i0 = load i16, i16* %p0, align 1
@@ -743,33 +456,16 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i16_to_4i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -778,15 +474,7 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -814,33 +502,16 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i16_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -849,15 +520,7 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -889,23 +552,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i16_to_8i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -915,31 +562,10 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = zext i16 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = zext i16 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = zext i16 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = zext i16 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i16_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -952,23 +578,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1014,32 +624,21 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i32_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %i0 = load i32, i32* %p0, align 1
@@ -1059,33 +658,16 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i32_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i32_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1094,15 +676,7 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
index 787dc2d48928..4bf34ff0c406 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -16,32 +16,21 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i8_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i8_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %i0 = load i8, i8* %p0, align 1
@@ -61,33 +50,16 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i8_to_4i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -96,15 +68,7 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -132,33 +96,16 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i8_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i8_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -167,15 +114,7 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -207,23 +146,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i16> [[V7]]
+; SSE2-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i8_to_8i16(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -233,31 +156,10 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i16> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; SLM-NEXT:    ret <8 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -270,23 +172,7 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i16> [[V7]]
+; AVX-NEXT:    ret <8 x i16> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -334,23 +220,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i8_to_8i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -360,31 +230,10 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i8_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -397,23 +246,7 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i8, i8* %p0, i64 1
   %p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -469,39 +302,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; SSE2-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; SSE2-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; SSE2-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; SSE2-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; SSE2-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; SSE2-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; SSE2-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; SSE2-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; SSE2-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; SSE2-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; SSE2-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; SSE2-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; SSE2-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; SSE2-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; SSE2-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; SSE2-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; SSE2-NEXT:    ret <16 x i16> [[V15]]
+; SSE2-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_16i8_to_16i16(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -519,55 +320,10 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; SLM-NEXT:    [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
 ; SLM-NEXT:    [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
 ; SLM-NEXT:    [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; SLM-NEXT:    [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; SLM-NEXT:    [[I8:%.*]] = load i8, i8* [[P8]], align 1
-; SLM-NEXT:    [[I9:%.*]] = load i8, i8* [[P9]], align 1
-; SLM-NEXT:    [[I10:%.*]] = load i8, i8* [[P10]], align 1
-; SLM-NEXT:    [[I11:%.*]] = load i8, i8* [[P11]], align 1
-; SLM-NEXT:    [[I12:%.*]] = load i8, i8* [[P12]], align 1
-; SLM-NEXT:    [[I13:%.*]] = load i8, i8* [[P13]], align 1
-; SLM-NEXT:    [[I14:%.*]] = load i8, i8* [[P14]], align 1
-; SLM-NEXT:    [[I15:%.*]] = load i8, i8* [[P15]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i8 [[I0]] to i16
-; SLM-NEXT:    [[X1:%.*]] = zext i8 [[I1]] to i16
-; SLM-NEXT:    [[X2:%.*]] = zext i8 [[I2]] to i16
-; SLM-NEXT:    [[X3:%.*]] = zext i8 [[I3]] to i16
-; SLM-NEXT:    [[X4:%.*]] = zext i8 [[I4]] to i16
-; SLM-NEXT:    [[X5:%.*]] = zext i8 [[I5]] to i16
-; SLM-NEXT:    [[X6:%.*]] = zext i8 [[I6]] to i16
-; SLM-NEXT:    [[X7:%.*]] = zext i8 [[I7]] to i16
-; SLM-NEXT:    [[X8:%.*]] = zext i8 [[I8]] to i16
-; SLM-NEXT:    [[X9:%.*]] = zext i8 [[I9]] to i16
-; SLM-NEXT:    [[X10:%.*]] = zext i8 [[I10]] to i16
-; SLM-NEXT:    [[X11:%.*]] = zext i8 [[I11]] to i16
-; SLM-NEXT:    [[X12:%.*]] = zext i8 [[I12]] to i16
-; SLM-NEXT:    [[X13:%.*]] = zext i8 [[I13]] to i16
-; SLM-NEXT:    [[X14:%.*]] = zext i8 [[I14]] to i16
-; SLM-NEXT:    [[X15:%.*]] = zext i8 [[I15]] to i16
-; SLM-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
-; SLM-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
-; SLM-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
-; SLM-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
-; SLM-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
-; SLM-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
-; SLM-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
-; SLM-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
-; SLM-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
-; SLM-NEXT:    ret <16 x i16> [[V15]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; SLM-NEXT:    ret <16 x i16> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_16i8_to_16i16(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -588,39 +344,7 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; AVX-NEXT:    [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; AVX-NEXT:    [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; AVX-NEXT:    [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; AVX-NEXT:    [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; AVX-NEXT:    [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; AVX-NEXT:    [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; AVX-NEXT:    [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; AVX-NEXT:    [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; AVX-NEXT:    [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; AVX-NEXT:    [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; AVX-NEXT:    [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; AVX-NEXT:    [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; AVX-NEXT:    [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; AVX-NEXT:    [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; AVX-NEXT:    ret <16 x i16> [[V15]]
+; AVX-NEXT:    ret <16 x i16> [[TMP3]]
 ;
   %p1  = getelementptr inbounds i8, i8* %p0, i64 1
   %p2  = getelementptr inbounds i8, i8* %p0, i64 2
@@ -698,32 +422,21 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i16_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i16_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %i0 = load i16, i16* %p0, align 1
@@ -743,33 +456,16 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i32> [[V3]]
+; SSE2-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i16_to_4i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i32> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    ret <4 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -778,15 +474,7 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i32> [[V3]]
+; AVX-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -814,33 +502,16 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i16_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i16_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -849,15 +520,7 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -889,23 +552,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE2-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE2-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE2-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE2-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE2-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE2-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE2-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE2-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE2-NEXT:    ret <8 x i32> [[V7]]
+; SSE2-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_8i16_to_8i32(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -915,31 +562,10 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; SLM-NEXT:    [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
 ; SLM-NEXT:    [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
 ; SLM-NEXT:    [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; SLM-NEXT:    [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SLM-NEXT:    [[I4:%.*]] = load i16, i16* [[P4]], align 1
-; SLM-NEXT:    [[I5:%.*]] = load i16, i16* [[P5]], align 1
-; SLM-NEXT:    [[I6:%.*]] = load i16, i16* [[P6]], align 1
-; SLM-NEXT:    [[I7:%.*]] = load i16, i16* [[P7]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i16 [[I0]] to i32
-; SLM-NEXT:    [[X1:%.*]] = zext i16 [[I1]] to i32
-; SLM-NEXT:    [[X2:%.*]] = zext i16 [[I2]] to i32
-; SLM-NEXT:    [[X3:%.*]] = zext i16 [[I3]] to i32
-; SLM-NEXT:    [[X4:%.*]] = zext i16 [[I4]] to i32
-; SLM-NEXT:    [[X5:%.*]] = zext i16 [[I5]] to i32
-; SLM-NEXT:    [[X6:%.*]] = zext i16 [[I6]] to i32
-; SLM-NEXT:    [[X7:%.*]] = zext i16 [[I7]] to i32
-; SLM-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
-; SLM-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; SLM-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; SLM-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; SLM-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; SLM-NEXT:    ret <8 x i32> [[V7]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; SLM-NEXT:    ret <8 x i32> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_8i16_to_8i32(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -952,23 +578,7 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX-NEXT:    [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX-NEXT:    [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX-NEXT:    [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX-NEXT:    [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX-NEXT:    ret <8 x i32> [[V7]]
+; AVX-NEXT:    ret <8 x i32> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i16, i16* %p0, i64 1
   %p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -1014,32 +624,21 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    ret <2 x i64> [[V1]]
+; SSE2-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_2i32_to_2i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    ret <2 x i64> [[V1]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT:    ret <2 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_2i32_to_2i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    ret <2 x i64> [[V1]]
+; AVX-NEXT:    ret <2 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %i0 = load i32, i32* %p0, align 1
@@ -1059,33 +658,16 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; SSE2-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SSE2-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SSE2-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SSE2-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SSE2-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SSE2-NEXT:    ret <4 x i64> [[V3]]
+; SSE2-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; SLM-LABEL: @loadext_4i32_to_4i64(
 ; SLM-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
 ; SLM-NEXT:    [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
 ; SLM-NEXT:    [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT:    [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SLM-NEXT:    [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SLM-NEXT:    [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SLM-NEXT:    [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SLM-NEXT:    [[X0:%.*]] = zext i32 [[I0]] to i64
-; SLM-NEXT:    [[X1:%.*]] = zext i32 [[I1]] to i64
-; SLM-NEXT:    [[X2:%.*]] = zext i32 [[I2]] to i64
-; SLM-NEXT:    [[X3:%.*]] = zext i32 [[I3]] to i64
-; SLM-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SLM-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SLM-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SLM-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SLM-NEXT:    ret <4 x i64> [[V3]]
+; SLM-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT:    ret <4 x i64> [[TMP3]]
 ;
 ; AVX-LABEL: @loadext_4i32_to_4i64(
 ; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -1094,15 +676,7 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
 ; AVX-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 ; AVX-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; AVX-NEXT:    [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; AVX-NEXT:    [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; AVX-NEXT:    ret <4 x i64> [[V3]]
+; AVX-NEXT:    ret <4 x i64> [[TMP3]]
 ;
   %p1 = getelementptr inbounds i32, i32* %p0, i64 1
   %p2 = getelementptr inbounds i32, i32* %p0, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll
index a6a1bbe4379f..740639a8ec55 100644
--- a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions-inseltpoison.ll
@@ -9,15 +9,7 @@ define <4 x float> @memread_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vmemread(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16

diff  --git a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll
index ab7dcc7f4578..51fce1134dc3 100644
--- a/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/vectorizable-functions.ll
@@ -9,15 +9,7 @@ define <4 x float> @memread_4x(<4 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vmemread(<4 x float> [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16


        


More information about the llvm-commits mailing list