[llvm] a45688a - [SLP] Apply external to vectorizable tree users cost adjustment for

Valery N Dmitriev via llvm-commits llvm-commits at lists.llvm.org
Fri May 29 15:39:00 PDT 2020


Author: Valery N Dmitriev
Date: 2020-05-29T15:37:41-07:00
New Revision: a45688a72c63b6359df7c23f2a2895f271d414e9

URL: https://github.com/llvm/llvm-project/commit/a45688a72c63b6359df7c23f2a2895f271d414e9
DIFF: https://github.com/llvm/llvm-project/commit/a45688a72c63b6359df7c23f2a2895f271d414e9.diff

LOG: [SLP] Apply external to vectorizable tree users cost adjustment for
relevant aggregate build instructions only (UserCost).
Users are detected with the findBuildAggregate routine; the subtlety is
that the subsequent SLP vectorization may end up vectorizing the entire
list in smaller chunks. The cost adjustment is then applied to the
individual chunks, and each chunk's adjustment has to be smaller than
the cost of building the entire aggregate.
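
For reference, the heart of the per-chunk compensation, excerpted from
the patch below: for a chunk of OpsWidth scalars starting at index I,
only the inserts that consume those scalars contribute to UserCost.

    unsigned UserCost = 0;
    for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
      auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
        UserCost += TTI->getVectorInstrCost(
            Instruction::InsertElement, IE->getType(), CI->getZExtValue());
    }
    Cost -= UserCost;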

Differential Revision: https://reviews.llvm.org/D80773

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
    llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
    llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
    llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
    llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 8a01114745f3..77236dec75dc 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -93,11 +93,15 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a list of operands.
-  /// \param UserCost Cost of the user operations of \p VL if they may affect
-  /// the cost of the vectorization.
+  /// When \p InsertUses is provided and its entries are non-null, the
+  /// users of \p VL are known to be InsertElement instructions, each
+  /// associated with the corresponding \p VL entry index. Their cost is
+  /// then used to adjust the vectorization cost, assuming the instcombine
+  /// pass later optimizes away the ExtractElement-InsertElement sequences.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          int UserCost = 0, bool AllowReorder = false);
+                          bool AllowReorder = false,
+                          ArrayRef<Value *> InsertUses = None);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
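
For context, a minimal sketch of a call site for the new signature. The
names below (LastInsert, Changed) are illustrative only; the real
callers appear in SLPVectorizer.cpp further down.

  // Scalars[i] is the value inserted by Inserts[i]; the two vectors are
  // kept parallel by findBuildAggregate.
  SmallVector<Value *, 8> Scalars;
  SmallVector<Value *, 8> Inserts;
  if (findBuildAggregate(LastInsert, TTI, Scalars, Inserts))
    Changed |= tryToVectorizeList(Scalars, R, /*AllowReorder=*/false,
                                  /*InsertUses=*/Inserts);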

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4c18fab4ec09..218b7fc83e42 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5929,12 +5929,13 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
 bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   if (!A || !B)
     return false;
-  Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+  Value *VL[] = {A, B};
+  return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
 }
 
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                           int UserCost, bool AllowReorder) {
+                                           bool AllowReorder,
+                                           ArrayRef<Value *> InsertUses) {
   if (VL.size() < 2)
     return false;
 
@@ -5983,6 +5984,13 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   bool CandidateFound = false;
   int MinCost = SLPCostThreshold;
 
+  bool CompensateUseCost =
+      !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+        return V && isa<InsertElementInst>(V);
+      });
+  assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+         "Each scalar expected to have an associated InsertElement user.");
+
   unsigned NextInst = 0, MaxInst = VL.size();
   for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
     // No actual vectorization should happen, if number of parts is the same as
@@ -6030,8 +6038,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
         continue;
 
       R.computeMinimumValueSizes();
-      int Cost = R.getTreeCost() - UserCost;
+      int Cost = R.getTreeCost();
       CandidateFound = true;
+      if (CompensateUseCost) {
+        // TODO: Use TTI's getScalarizationOverhead for a sequence of inserts
+        // rather than the sum of single inserts, as the latter may
+        // overestimate the cost. This work should also improve the cost
+        // estimation for extracts added for external (to the vectorization
+        // tree) users, i.e. that part should switch to the same interface.
+        // For example, the following case is projected code after SLP:
+        //  %4 = extractelement <4 x i64> %3, i32 0
+        //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+        //  %5 = extractelement <4 x i64> %3, i32 1
+        //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+        //  %6 = extractelement <4 x i64> %3, i32 2
+        //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+        //  %7 = extractelement <4 x i64> %3, i32 3
+        //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+        //
+        // The extracts here are added by SLP to feed the users (the
+        // inserts) of the original scalars and contribute to "ExtractCost"
+        // during cost evaluation. The inserts in turn form a sequence that
+        // builds an aggregate, which is detected by the findBuildAggregate
+        // routine. SLP assumes that such a sequence will be optimized away
+        // later (by instcombine), so it tries to compensate ExtractCost
+        // with the cost of the insert sequence.
+        // The current per-element cost calculation is not quite accurate
+        // and tends to create a bias toward favoring vectorization.
+        // Switching to the TTI interface might help a bit.
+        // An alternative solution could be to pattern-match the sequence
+        // and treat it as a no-op or a shuffle.
+        unsigned UserCost = 0;
+        for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+          auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+          if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+            UserCost += TTI->getVectorInstrCost(
+                Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+        }
+        LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+                          << ".\n");
+        Cost -= UserCost;
+      }
+
       MinCost = std::min(MinCost, Cost);
 
       if (Cost < -SLPCostThreshold) {
@@ -7047,48 +7095,16 @@ class HorizontalReduction {
 /// \return true if it matches.
 static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
                                SmallVectorImpl<Value *> &BuildVectorOpds,
-                               int &UserCost) {
+                               SmallVectorImpl<Value *> &InsertElts) {
   assert((isa<InsertElementInst>(LastInsertInst) ||
           isa<InsertValueInst>(LastInsertInst)) &&
          "Expected insertelement or insertvalue instruction!");
-  UserCost = 0;
   do {
-    // TODO: Use TTI's getScalarizationOverhead for sequence of inserts rather
-    // than sum of single inserts as the latter may overestimate cost.
-    // This work should imply improving cost estimation for extracts that
-    // added in for external (for vectorization tree) users.
-    // For example, in following case all extracts added in order to feed
-    // into external users (inserts), which in turn form sequence to build
-    // an aggregate that we do match here:
-    //  %4 = extractelement <4 x i64> %3, i32 0
-    //  %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
-    //  %5 = extractelement <4 x i64> %3, i32 1
-    //  %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
-    //  %6 = extractelement <4 x i64> %3, i32 2
-    //  %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
-    //  %7 = extractelement <4 x i64> %3, i32 3
-    //  %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
-    //
-    // Cost of this entire sequence is currently estimated as sum of single
-    // extracts (as this aggregate build sequence is an external to
-    // vectorization tree user) minus cost of the aggregate build.
-    // As this whole sequence will be optimized away we want the cost to be
-    // zero. But it is not quite possible using given approach (at least for
-    // X86) because inserts can be more expensive than extracts for longer
-    // vector lengths so the difference turns out not zero in such a case.
-    // Ideally we want to match this entire sequence and treat it as a no-op
-    // (i.e. do not count into final cost at all).
-    // Currently the difference tends to be negative thus adding a bias
-    // toward favoring vectorization. If we switch into using TTI interface
-    // the bias tendency will remain but will be lower.
     Value *InsertedOperand;
-    if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+    auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+    if (IE) {
       InsertedOperand = IE->getOperand(1);
       LastInsertInst = IE->getOperand(0);
-      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
-        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
-                                            IE->getType(), CI->getZExtValue());
-      }
     } else {
       auto *IV = cast<InsertValueInst>(LastInsertInst);
       InsertedOperand = IV->getInsertedValueOperand();
@@ -7096,16 +7112,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
     }
     if (isa<InsertElementInst>(InsertedOperand) ||
         isa<InsertValueInst>(InsertedOperand)) {
-      int TmpUserCost;
       SmallVector<Value *, 8> TmpBuildVectorOpds;
+      SmallVector<Value *, 8> TmpInsertElts;
       if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
-                              TmpUserCost))
+                              TmpInsertElts))
         return false;
       BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
                              TmpBuildVectorOpds.rend());
-      UserCost += TmpUserCost;
+      InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
     } else {
       BuildVectorOpds.push_back(InsertedOperand);
+      InsertElts.push_back(IE);
     }
     if (isa<UndefValue>(LastInsertInst))
       break;
@@ -7115,6 +7132,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
       return false;
   } while (true);
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+  std::reverse(InsertElts.begin(), InsertElts.end());
   return true;
 }
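
Note the invariant the reworked helper provides: BuildVectorOpds and
InsertElts come back as parallel vectors, with InsertElts[i] being the
InsertElementInst that produces BuildVectorOpds[i], or null when that
element came from an insertvalue chain (IE is a failed dyn_cast in that
branch). A sketch of what a caller may rely on, with illustrative names:

  SmallVector<Value *, 8> Opds, Elts;
  if (findBuildAggregate(IEI, TTI, Opds, Elts)) {
    // One (possibly null) insert is recorded per collected operand, so
    // the indices line up lane by lane.
    assert(Opds.size() == Elts.size() && "expected parallel results");
  }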
 
@@ -7279,26 +7297,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
 
 bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                  BasicBlock *BB, BoUpSLP &R) {
-  int UserCost = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
   if (!R.canMapToVector(IVI->getType(), DL))
     return false;
 
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+  SmallVector<Value *, 16> BuildVectorInsts;
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2)
     return false;
 
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in vector register, we need to
   // extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
-  int UserCost;
+  SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
-  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+      BuildVectorOpds.size() < 2 ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
        isShuffle(BuildVectorOpds)))
@@ -7306,7 +7327,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
 
   // Vectorize starting with the build vector operands ignoring the BuildVector
   // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+                            BuildVectorInsts);
 }
 
 bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7384,8 +7406,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       // is done when there are exactly two elements since tryToVectorizeList
       // asserts that there are only two values when AllowReorder is true.
       bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
-                                            /*UserCost=*/0, AllowReorder)) {
+      if (NumElts > 1 &&
+          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 707d2a216a1a..6d2c2f410ade 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -153,15 +153,12 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
 ; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
+; CHECK-NEXT:    [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
 ; CHECK-NEXT:    [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
 ; CHECK-NEXT:    [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
 ; CHECK-NEXT:    ret <4 x i32> [[TMP2_3]]
@@ -187,21 +184,25 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
index de7c59286ac3..e1cad3e14014 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -56,20 +56,38 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-NEXT:    ret <8 x float> [[R7]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> <i32 4, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 undef, i32 undef>
-; SLM-NEXT:    [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 10, i32 undef>
-; SLM-NEXT:    [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x float> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x float> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], [[B0]]
+; SLM-NEXT:    [[AB1:%.*]] = fdiv float [[A1]], [[B1]]
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], [[B2]]
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], [[B3]]
+; SLM-NEXT:    [[AB4:%.*]] = fmul float [[A4]], [[B4]]
+; SLM-NEXT:    [[AB5:%.*]] = fdiv float [[A5]], [[B5]]
+; SLM-NEXT:    [[AB6:%.*]] = fdiv float [[A6]], [[B6]]
+; SLM-NEXT:    [[AB7:%.*]] = fmul float [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x float> [[R7]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 085b7137f2b2..0324f35330c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -101,11 +101,44 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @ashr_shl_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x i32> [[R7]]
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32(
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT:    [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT:    [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; AVX1-NEXT:    [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; AVX1-NEXT:    [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT:    [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT:    [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; AVX1-NEXT:    [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; AVX1-NEXT:    [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX1-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX2-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT:    [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    ret <8 x i32> [[R7]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index ae79a7eb0e80..de686fe0e18c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -179,13 +179,22 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fadd double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index c1c2ac544b15..c71e0da97336 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -179,13 +179,22 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT:    [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT:    [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT:    [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT:    [[R2:%.*]] = fsub double [[A2]], [[A3]]
+; SLM-NEXT:    [[R3:%.*]] = fsub double [[B2]], [[B3]]
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0
+; SLM-NEXT:    [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT:    [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2
+; SLM-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3
 ; SLM-NEXT:    ret <4 x double> [[R03]]
 ;
 ; AVX-LABEL: @test_v4f64(

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 524a1c51389e..50eeead886fe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -111,15 +111,13 @@ define <4 x float> @PR16739_byval(<4 x float>* nocapture readonly dereferenceabl
 ; CHECK-NEXT:    [[T2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
 ; CHECK-NEXT:    [[T3:%.*]] = bitcast float* [[T2]] to i64*
 ; CHECK-NEXT:    [[T4:%.*]] = load i64, i64* [[T3]], align 8
+; CHECK-NEXT:    [[T5:%.*]] = trunc i64 [[T1]] to i32
+; CHECK-NEXT:    [[T6:%.*]] = bitcast i32 [[T5]] to float
+; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0
 ; CHECK-NEXT:    [[T8:%.*]] = lshr i64 [[T1]], 32
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[T1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[T8]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT:    [[T7:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
-; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP6]], i32 1
+; CHECK-NEXT:    [[T9:%.*]] = trunc i64 [[T8]] to i32
+; CHECK-NEXT:    [[T10:%.*]] = bitcast i32 [[T9]] to float
+; CHECK-NEXT:    [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1
 ; CHECK-NEXT:    [[T12:%.*]] = trunc i64 [[T4]] to i32
 ; CHECK-NEXT:    [[T13:%.*]] = bitcast i32 [[T12]] to float
 ; CHECK-NEXT:    [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
index 9ceea2b81ac9..532bf693c631 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll
@@ -6,8 +6,6 @@
 ; Vectorization triggered by cost bias caused by subtracting
 ; the cost of entire "aggregate build" sequence while
 ; building vectorizable tree from only a portion of it.
-; FIXME: this is unprofitable to vectorize.
-
 
 define void @test(i32* nocapture %t2) {
 ; CHECK-LABEL: @test(
@@ -29,9 +27,11 @@ define void @test(i32* nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
+; CHECK-NEXT:    [[T28:%.*]] = add nsw i32 [[T15]], [[T9]]
 ; CHECK-NEXT:    [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -41,29 +41,16 @@ define void @test(i32* nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[T15]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[T9]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 6270, i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T47]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
-; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
-; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP13]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[TMP14]], i32 2
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[TMP15]], i32 3
-; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP12]], i32 4
-; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
+; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT:    [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
+; CHECK-NEXT:    [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0
+; CHECK-NEXT:    [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
+; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
+; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT:    [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4
+; CHECK-NEXT:    [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP15]], i32 7
+; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
 ; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 ; CHECK-NEXT:    [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4


        

