[llvm] [SLP]Enable float point math ops as copyables elements. (PR #169857)

Thu Dec 4 07:50:48 PST 2025

https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/169857

>From 2cb37857117543a43442f58dd0519a4cbc8a9532 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Thu, 27 Nov 2025 13:44:18 -0800
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  49 ++++---
 .../AArch64/shuffle-vectors-mask-size.ll      |   6 +-
 .../X86/bv-root-part-of-graph.ll              |  11 +-
 .../SLPVectorizer/X86/crash_smallpt.ll        |  22 +--
 .../entry-no-bundle-but-extra-use-on-vec.ll   |  35 +++--
 .../extractelement-single-use-many-nodes.ll   |   3 +-
 .../X86/multi-node-for-copyable-parent.ll     |  15 +-
 .../X86/multi-node-user-with-copyable-ops.ll  |  19 +--
 .../non-commutative-op-in-commutative-inst.ll |  12 +-
 .../reused-last-instruction-in-split-node.ll  |  24 +--
 .../X86/user-with-multi-copyable-ops.ll       |  44 +++---
 .../X86/vect_copyable_in_binops.ll            | 128 ++++------------
 .../SLPVectorizer/alternate-non-profitable.ll |  11 +-
 .../SLPVectorizer/crash_exceed_scheduling.ll  | 138 ++++++++++++------
 .../extract-many-users-buildvector.ll         |  72 +++++----
 .../SLPVectorizer/insertelement-postpone.ll   |  40 ++---
 16 files changed, 288 insertions(+), 341 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3b36ccbd677dc..c3f8689e62d53 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5325,13 +5325,14 @@ class slpvectorizer::BoUpSLP {
       if (ScheduleCopyableDataMap.empty())
         return false;
       SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
-      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
+      ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
+      if (Entries.empty())
+        return false;
+      unsigned CurNumOps = 0;
       for (const Use &U : User->operands()) {
         if (U.get() != Op)
           continue;
-        ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
-        if (Entries.empty())
-          return false;
+        ++CurNumOps;
         // Check all tree entries, if they have operands replaced by copyable
         // data.
         for (TreeEntry *TE : Entries) {
@@ -5363,28 +5364,28 @@ class slpvectorizer::BoUpSLP {
           // reordered.
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
+          constexpr unsigned NumCommutativeOps = 2;
           bool IsCommutativeUser =
-              ::isCommutative(User) ||
-              ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
-          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
-            unsigned &OpCnt =
-                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
+              U.getOperandNo() < NumCommutativeOps &&
+              (::isCommutative(User) ||
+               ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User));
+          bool IsCommutativeWithSameOps =
+              IsCommutativeUser &&
+              User->getNumOperands() >= NumCommutativeOps &&
+              User->getOperand(0) == User->getOperand(1);
+          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
+              !isa<CmpInst>(User)) {
             EdgeInfo EI(TE, U.getOperandNo());
-            if (!getScheduleCopyableData(EI, Op))
+            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
               continue;
-            // Found copyable operand - continue.
-            OpCnt += Inc;
-            continue;
+            return false;
           }
           PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
               .first->getSecond() += Inc;
         }
       }
       if (PotentiallyReorderedEntriesCount.empty())
-        return all_of(OrderedEntriesCount,
-                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                        return P.second == NumOps;
-                      });
+        return true;
       // Check the commutative/cmp entries.
       for (auto &P : PotentiallyReorderedEntriesCount) {
         SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5430,10 +5431,6 @@ class slpvectorizer::BoUpSLP {
       return all_of(PotentiallyReorderedEntriesCount,
                     [&](const std::pair<const TreeEntry *, unsigned> &P) {
                       return P.second == NumOps - 1;
-                    }) &&
-             all_of(OrderedEntriesCount,
-                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                      return P.second == NumOps;
                     });
     }
 
@@ -5655,6 +5652,7 @@ class slpvectorizer::BoUpSLP {
                 }
               };
 
+          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
@@ -5662,7 +5660,6 @@ class slpvectorizer::BoUpSLP {
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
-            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
             bool IsNonSchedulableWithParentPhiNode =
                 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                 Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10865,7 +10862,9 @@ class InstructionsCompatibilityAnalysis {
            Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
            Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
            Opcode == Instruction::And || Opcode == Instruction::Or ||
-           Opcode == Instruction::Xor;
+           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
+           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
+           Opcode == Instruction::FDiv;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -11198,6 +11197,10 @@ class InstructionsCompatibilityAnalysis {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
+      case Instruction::FAdd:
+      case Instruction::FMul:
+      case Instruction::FSub:
+      case Instruction::FDiv:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 0783a28f56d85..961662c664a31 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 0cc4d3db5c537..1abc16da77c8e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,15 +4,16 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[FMUL:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[SITOFP:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index d13a8578d1e00..c1cc3f2dfc9e5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,36 +7,30 @@
 define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    br label [[INVOKE_CONT:%.*]]
 ; CHECK:       invoke.cont:
-; CHECK-NEXT:    br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
 ; CHECK:       arrayctor.cont:
 ; CHECK-NEXT:    [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER:%.*]]
 ; CHECK:       for.cond36.preheader:
-; CHECK-NEXT:    br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
 ; CHECK:       cond.false51.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true48.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
 ; CHECK:       cond.false66.us:
-; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr undef, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
+; CHECK-NEXT:    store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    ret void
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       for.body42.lr.ph.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
 ; CHECK:       _Z5clampd.exit.1:
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER]]
 ;
@@ -96,7 +90,7 @@ _Z5clampd.exit.1:
 define void @test(i1 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index 6d713e83bbf4e..ca65ff88a4b81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,33 +9,38 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 6942df532ae29..91ec61b275205 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,8 +25,7 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index a07e617384e09..fd7f0c61b6737 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,14 +6,17 @@ define i1 @test(double %circ_radius, ptr %x) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index eb3b183fd49eb..a9baedef3e509 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,17 +6,18 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index 8c684325f8c68..b71dbc49e7478 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,13 +8,11 @@ define void @test(ptr %quat, float %call13) {
 ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index f101991648276..6dc9806da0aa9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,9 +4,7 @@
 define float @test() {
 ; CHECK-LABEL: define float @test() {
 ; CHECK-NEXT:  [[LABEL:.*]]:
-; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -14,26 +12,12 @@ define float @test() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
 ; CHECK-NEXT:    br label %[[REGION_30:.*]]
 ; CHECK:       [[REGION_30]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
 ; CHECK-NEXT:    ret float [[TMP27]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index 7b298723d93b5..c58c63e51737c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,30 +11,23 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
-; CHECK-NEXT:    [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT:    [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg double [[TMP3]]
 ; CHECK-NEXT:    [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
 ; CHECK-NEXT:    [[FNEG381:%.*]] = fneg double [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[NEG417:%.*]] = fneg double [[MUL257]]
-; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG418:%.*]] = fneg double [[TMP16]]
-; CHECK-NEXT:    [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
 ; CHECK-NEXT:    [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG437:%.*]] = fneg double [[TMP17]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP4]]
 ; CHECK-NEXT:    [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -42,19 +35,18 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[FNEG474:%.*]] = fneg double [[TMP20]]
 ; CHECK-NEXT:    [[NEG492:%.*]] = fneg double [[MUL305]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 2a0e7889f0f34..d10d26671e76b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -336,32 +336,12 @@ entry:
 }
 
 define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -387,18 +367,9 @@ entry:
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -565,18 +536,9 @@ entry:
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -631,32 +593,12 @@ entry:
 }
 
 define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1fn(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1fn(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -682,18 +624,9 @@ entry:
 define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -780,18 +713,9 @@ entry:
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index 125c2dce32663..b23da5fa263f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,11 +52,12 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10
diff --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index 793d089404d1e..c79969de6ac41 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,52 +1,98 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @exceed(double %0, double %1) {
-; CHECK-LABEL: @exceed(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[LABEL:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    br label [[LABEL]]
-; CHECK:       label:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; CHECK-NEXT:    ret void
+; X86-LABEL: @exceed(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; X86-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; X86-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; X86-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; X86-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; X86-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; X86-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; X86-NEXT:      i32 0, label [[BB2:%.*]]
+; X86-NEXT:    ]
+; X86:       bb1:
+; X86-NEXT:    br label [[LABEL:%.*]]
+; X86:       bb2:
+; X86-NEXT:    br label [[LABEL]]
+; X86:       label:
+; X86-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: @exceed(
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; AARCH64-NEXT:    [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
+; AARCH64-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; AARCH64-NEXT:    [[IX:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[IX1:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
+; AARCH64-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
+; AARCH64-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; AARCH64-NEXT:      i32 0, label [[BB2:%.*]]
+; AARCH64-NEXT:    ]
+; AARCH64:       bb1:
+; AARCH64-NEXT:    br label [[LABEL:%.*]]
+; AARCH64:       bb2:
+; AARCH64-NEXT:    br label [[LABEL]]
+; AARCH64:       label:
+; AARCH64-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %i10 = fdiv fast double %0, %1
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 32e59697486a7..439943102b58a 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,56 +7,52 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
-; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
-; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
-; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
-; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
-; X86-NEXT:    ret i1 [[TMP25]]
+; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; X86-NEXT:    ret i1 [[TMP23]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
 ; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT:    [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
-; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
-; AARCH64-NEXT:    [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
-; AARCH64-NEXT:    [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
-; AARCH64-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
-; AARCH64-NEXT:    ret i1 [[TMP26]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
+; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
+; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
+; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
+; AARCH64-NEXT:    ret i1 [[TMP24]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index eefc99feebb95..09e3ef41b3dbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
 ; X86-LABEL: @test(
 ; X86-NEXT:  entry:
 ; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; X86-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AARCH86-LABEL: @test(
 ; AARCH86-NEXT:  entry:
 ; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; AARCH86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; AARCH86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; AARCH86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; AARCH86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; AARCH86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; AARCH86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; AARCH86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; AARCH86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; AARCH86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; AARCH86-NEXT:    [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; AARCH86-NEXT:    ret <4 x double> [[I1994]]
 ;