[llvm] c244168 - Revert "[SLP]Enable float point math ops as copyables elements."

Fri Dec 26 06:56:05 PST 2025

Author: Alexey Bataev
Date: 2025-12-26T06:55:32-08:00
New Revision: c2441689830fcb2588673dedba98da1219a2fb9e

URL: https://github.com/llvm/llvm-project/commit/c2441689830fcb2588673dedba98da1219a2fb9e
DIFF: https://github.com/llvm/llvm-project/commit/c2441689830fcb2588673dedba98da1219a2fb9e.diff

LOG: Revert "[SLP]Enable float point math ops as copyables elements."

This reverts commit 48be4d07c3ca045fe831cbdf216631202c55cd62
to investigate crashes reported in https://github.com/llvm/llvm-project/commit/2568ec6cb29da3db5bd7c848ec53a673c1431aea#commitcomment-173523022.

Added: 
    

Modified: 
    llvm/include/llvm/IR/Instruction.h
    llvm/include/llvm/IR/IntrinsicInst.h
    llvm/lib/IR/Instruction.cpp
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
    llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
    llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
    llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
    llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
    llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
    llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
    llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
    llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
    llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
    llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 11385666e7ff8..2eb4fd36c5b7d 100644

--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -762,12 +762,6 @@ class Instruction : public User,
   /// applied to any type.
   ///
   LLVM_ABI bool isCommutative() const LLVM_READONLY;
-
-  /// Checks if the operand is commutative. In commutative operations, not all
-  /// operands might commutable, e.g. for fmuladd only 2 first operands are
-  /// commutable.
-  LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY;
-
   static bool isCommutative(unsigned Opcode) {
     switch (Opcode) {
     case Add: case FAdd:

diff  --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 0b25baa465a71..0622bfae2c845 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -101,12 +101,6 @@ class IntrinsicInst : public CallInst {
     }
   }
 
-  /// Return true if the operand is commutable.
-  bool isCommutableOperand(unsigned Op) const {
-    constexpr unsigned NumCommutativeOps = 2;
-    return isCommutative() && Op < NumCommutativeOps;
-  }
-
   /// Checks if the intrinsic is an annotation.
   bool isAssumeLikeIntrinsic() const {
     switch (getIntrinsicID()) {

diff  --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 7682c28e23b33..f3d4d2424fe5b 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1293,13 +1293,6 @@ bool Instruction::isCommutative() const {
   return isCommutative(getOpcode());
 }
 
-bool Instruction::isCommutableOperand(unsigned Op) const {
-  if (auto *II = dyn_cast<IntrinsicInst>(this))
-    return II->isCommutableOperand(Op);
-  // TODO: Should allow icmp/fcmp?
-  return isCommutative(getOpcode());
-}
-
 unsigned Instruction::getNumSuccessors() const {
   switch (getOpcode()) {
 #define HANDLE_TERM_INST(N, OPC, CLASS)                                        \

diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c700fef5ecd8f..4af20f0e1838b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -575,27 +575,6 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
   return I->isCommutative();
 }
 
-/// Checks if the operand is commutative. In commutative operations, not all
-/// operands might commutable, e.g. for fmuladd only 2 first operands are
-/// commutable.
-static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
-                                bool IsCopyable = false) {
-  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
-         "The instruction is not commutative.");
-  if (isa<CmpInst>(I))
-    return true;
-  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
-    switch (BO->getOpcode()) {
-    case Instruction::Sub:
-    case Instruction::FSub:
-      return true;
-    default:
-      break;
-    }
-  }
-  return I->isCommutableOperand(Op);
-}
-
 /// This is a helper function to check whether \p I is commutative.
 /// This is a convenience wrapper that calls the two-parameter version of
 /// isCommutative with the same instruction for both parameters. This is
@@ -5349,14 +5328,13 @@ class slpvectorizer::BoUpSLP {
       if (ScheduleCopyableDataMap.empty())
         return false;
       SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
+      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
       ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
       if (Entries.empty())
         return false;
-      unsigned CurNumOps = 0;
       for (const Use &U : User->operands()) {
         if (U.get() != Op)
           continue;
-        ++CurNumOps;
         // Check all tree entries, if they have operands replaced by copyable
         // data.
         for (TreeEntry *TE : Entries) {
@@ -5389,43 +5367,27 @@ class slpvectorizer::BoUpSLP {
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
           bool IsCommutativeUser =
-              ::isCommutative(User) &&
-              ::isCommutableOperand(User, User, U.getOperandNo());
-          if (!IsCommutativeUser) {
-            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
-            IsCommutativeUser =
-                ::isCommutative(MainOp, User) &&
-                ::isCommutableOperand(MainOp, User, U.getOperandNo());
-          }
-          // The commutative user with the same operands can be safely
-          // considered as non-commutative, operands reordering does not change
-          // the semantics.
-          assert(
-              (!IsCommutativeUser ||
-               (((::isCommutative(User) &&
-                  ::isCommutableOperand(User, User, 0) &&
-                  ::isCommutableOperand(User, User, 1)) ||
-                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
-                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
-                                        User, 0) &&
-                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
-                                        User, 1))))) &&
-              "Expected commutative user with 2 first commutable operands");
-          bool IsCommutativeWithSameOps =
-              IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
-          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
-              !isa<CmpInst>(User)) {
+              ::isCommutative(User) ||
+              ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
+          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
+            unsigned &OpCnt =
+                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
             EdgeInfo EI(TE, U.getOperandNo());
-            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
+            if (!getScheduleCopyableData(EI, Op))
               continue;
-            return false;
+            // Found copyable operand - continue.
+            OpCnt += Inc;
+            continue;
           }
           PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
               .first->getSecond() += Inc;
         }
       }
       if (PotentiallyReorderedEntriesCount.empty())
-        return true;
+        return all_of(OrderedEntriesCount,
+                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
+                        return P.second == NumOps;
+                      });
       // Check the commutative/cmp entries.
       for (auto &P : PotentiallyReorderedEntriesCount) {
         SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5471,6 +5433,10 @@ class slpvectorizer::BoUpSLP {
       return all_of(PotentiallyReorderedEntriesCount,
                     [&](const std::pair<const TreeEntry *, unsigned> &P) {
                       return P.second == NumOps - 1;
+                    }) &&
+             all_of(OrderedEntriesCount,
+                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
+                      return P.second == NumOps;
                     });
     }
 
@@ -5681,22 +5647,17 @@ class slpvectorizer::BoUpSLP {
                 auto It = OperandsUses.find(I);
                 assert(It != OperandsUses.end() && "Operand not found");
                 if (It->second > 0) {
+                  --It->getSecond();
+                  assert(TotalOpCount > 0 && "No more operands to decrement");
+                  --TotalOpCount;
                   if (ScheduleData *OpSD = getScheduleData(I)) {
                     if (!Checked.insert(std::make_pair(OpSD, OpIdx)).second)
                       return;
-                    --It->getSecond();
-                    assert(TotalOpCount > 0 && "No more operands to decrement");
-                    --TotalOpCount;
                     DecrUnsched(OpSD, /*IsControl=*/false);
-                  } else {
-                    --It->getSecond();
-                    assert(TotalOpCount > 0 && "No more operands to decrement");
-                    --TotalOpCount;
                   }
                 }
               };
 
-          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
@@ -5704,6 +5665,7 @@ class slpvectorizer::BoUpSLP {
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
+            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
             bool IsNonSchedulableWithParentPhiNode =
                 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                 Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10913,9 +10875,7 @@ class InstructionsCompatibilityAnalysis {
            Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
            Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
            Opcode == Instruction::And || Opcode == Instruction::Or ||
-           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
-           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
-           Opcode == Instruction::FDiv;
+           Opcode == Instruction::Xor;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -11256,10 +11216,6 @@ class InstructionsCompatibilityAnalysis {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
-      case Instruction::FAdd:
-      case Instruction::FMul:
-      case Instruction::FSub:
-      case Instruction::FDiv:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 961662c664a31..0783a28f56d85 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 1abc16da77c8e..0cc4d3db5c537 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,16 +4,15 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[FMUL:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT:    [[SITOFP:%.*]] = sitofp i32 0 to float
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index c1cc3f2dfc9e5..d13a8578d1e00 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,30 +7,36 @@
 define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    br label [[INVOKE_CONT:%.*]]
 ; CHECK:       invoke.cont:
-; CHECK-NEXT:    br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT:    br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
 ; CHECK:       arrayctor.cont:
 ; CHECK-NEXT:    [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER:%.*]]
 ; CHECK:       for.cond36.preheader:
-; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
 ; CHECK:       cond.false51.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true48.us:
-; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
 ; CHECK:       cond.false66.us:
-; CHECK-NEXT:    store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
-; CHECK-NEXT:    store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr undef, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    ret void
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       for.body42.lr.ph.us:
-; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
 ; CHECK:       _Z5clampd.exit.1:
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER]]
 ;
@@ -90,7 +96,7 @@ _Z5clampd.exit.1:
 define void @test(i1 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT:    br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index ca65ff88a4b81..6d713e83bbf4e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,38 +9,33 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT:    [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 91ec61b275205..6942df532ae29 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,7 +25,8 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index fd7f0c61b6737..a07e617384e09 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,17 +6,14 @@ define i1 @test(double %circ_radius, ptr %x) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index a9baedef3e509..eb3b183fd49eb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,18 +6,17 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
+; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index b71dbc49e7478..8c684325f8c68 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,11 +8,13 @@ define void @test(ptr %quat, float %call13) {
 ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
index a84c6ae6b0980..ba52ef4c462a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate-mmra.ll
@@ -5,9 +5,9 @@ define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  [[_PREHEADER16_PREHEADER:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4, !mmra [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP1]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP4]], <2 x float> zeroinitializer
 ; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr null, align 16

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index 6dc9806da0aa9..f101991648276 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,7 +4,9 @@
 define float @test() {
 ; CHECK-LABEL: define float @test() {
 ; CHECK-NEXT:  [[LABEL:.*]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
+; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -12,12 +14,26 @@ define float @test() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
 ; CHECK-NEXT:    br label %[[REGION_30:.*]]
 ; CHECK:       [[REGION_30]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
 ; CHECK-NEXT:    ret float [[TMP27]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
index f1031937180a3..3645ad89af624 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-operands-but-copyable.ll
@@ -7,10 +7,10 @@ define void @test(ptr %0, ptr %1, float %.sroa.3232.0.copyload) {
 ; CHECK-NEXT:  [[BB:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 12
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP3]], <float 0.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP3]], float [[DOTSROA_3232_0_COPYLOAD]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP5]], <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x float> [[TMP5]], [[TMP9]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index c58c63e51737c..7b298723d93b5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,23 +11,30 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
+; CHECK-NEXT:    [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
+; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT:    [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg double [[TMP3]]
 ; CHECK-NEXT:    [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
 ; CHECK-NEXT:    [[FNEG381:%.*]] = fneg double [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[NEG417:%.*]] = fneg double [[MUL257]]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[FNEG418:%.*]] = fneg double [[TMP16]]
+; CHECK-NEXT:    [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
 ; CHECK-NEXT:    [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
-; CHECK-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[FNEG437:%.*]] = fneg double [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP4]]
 ; CHECK-NEXT:    [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -35,18 +42,19 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[FNEG474:%.*]] = fneg double [[TMP20]]
 ; CHECK-NEXT:    [[NEG492:%.*]] = fneg double [[MUL305]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
-; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index d10d26671e76b..2a0e7889f0f34 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -336,12 +336,32 @@ entry:
 }
 
 define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1f(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @add1f(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @add1f(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -367,9 +387,18 @@ entry:
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -536,9 +565,18 @@ entry:
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -593,12 +631,32 @@ entry:
 }
 
 define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; CHECK-LABEL: @add1fn(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @add1fn(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @add1fn(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
+; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -624,9 +682,18 @@ entry:
 define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
+; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -713,9 +780,18 @@ entry:
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
-; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
+; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
+; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index b23da5fa263f6..125c2dce32663 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,12 +52,11 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10

diff  --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index c79969de6ac41..793d089404d1e 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,98 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
 
 define void @exceed(double %0, double %1) {
-; X86-LABEL: @exceed(
-; X86-NEXT:  entry:
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; X86-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; X86-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
-; X86-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; X86-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; X86-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; X86-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; X86-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; X86-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; X86-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; X86-NEXT:      i32 0, label [[BB2:%.*]]
-; X86-NEXT:    ]
-; X86:       bb1:
-; X86-NEXT:    br label [[LABEL:%.*]]
-; X86:       bb2:
-; X86-NEXT:    br label [[LABEL]]
-; X86:       label:
-; X86-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; X86-NEXT:    ret void
-;
-; AARCH64-LABEL: @exceed(
-; AARCH64-NEXT:  entry:
-; AARCH64-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; AARCH64-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; AARCH64-NEXT:    [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
-; AARCH64-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; AARCH64-NEXT:    [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; AARCH64-NEXT:    [[IX:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT:    [[IX1:%.*]] = fmul double [[TMP12]], undef
-; AARCH64-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
-; AARCH64-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; AARCH64-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
-; AARCH64-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; AARCH64-NEXT:      i32 0, label [[BB2:%.*]]
-; AARCH64-NEXT:    ]
-; AARCH64:       bb1:
-; AARCH64-NEXT:    br label [[LABEL:%.*]]
-; AARCH64:       bb2:
-; AARCH64-NEXT:    br label [[LABEL]]
-; AARCH64:       label:
-; AARCH64-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
-; AARCH64-NEXT:    ret void
+; CHECK-LABEL: @exceed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[LABEL:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[LABEL]]
+; CHECK:       label:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; CHECK-NEXT:    ret void
 ;
 entry:
   %i10 = fdiv fast double %0, %1

diff  --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 439943102b58a..32e59697486a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,52 +7,56 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
-; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
-; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
-; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
-; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
-; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
-; X86-NEXT:    ret i1 [[TMP23]]
+; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
+; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
+; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
+; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
+; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
+; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
+; X86-NEXT:    ret i1 [[TMP25]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
-; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
-; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
-; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
+; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
 ; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
-; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
-; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
-; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
-; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
-; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
-; AARCH64-NEXT:    ret i1 [[TMP24]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
+; AARCH64-NEXT:    [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
+; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
+; AARCH64-NEXT:    [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
+; AARCH64-NEXT:    [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
+; AARCH64-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
+; AARCH64-NEXT:    ret i1 [[TMP26]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double

diff  --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index 09e3ef41b3dbe..eefc99feebb95 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
 ; X86-LABEL: @test(
 ; X86-NEXT:  entry:
 ; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
 ; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
 ; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; X86-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AARCH86-LABEL: @test(
 ; AARCH86-NEXT:  entry:
 ; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
-; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
+; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
 ; AARCH86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
-; AARCH86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
-; AARCH86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
-; AARCH86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
-; AARCH86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
-; AARCH86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; AARCH86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; AARCH86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
 ; AARCH86-NEXT:    [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; AARCH86-NEXT:    ret <4 x double> [[I1994]]
 ;