[llvm] e644f06 - [SLP]Enable float point math ops as copyables elements.

Sun Dec 21 11:45:52 PST 2025

Author: Alexey Bataev
Date: 2025-12-21T14:45:48-05:00
New Revision: e644f06c2ffc23b3415f3478b05c627303aef614

URL: https://github.com/llvm/llvm-project/commit/e644f06c2ffc23b3415f3478b05c627303aef614
DIFF: https://github.com/llvm/llvm-project/commit/e644f06c2ffc23b3415f3478b05c627303aef614.diff

LOG: [SLP]Enable float point math ops as copyables elements.

Patch enables support for float point math operations as base
instructions for copyable elements. It also fixes some scheduling
issues, found during testing

Reviewers: hiraditya, RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/169857

Added: 
    

Modified: 
    llvm/include/llvm/IR/Instruction.h
    llvm/include/llvm/IR/IntrinsicInst.h
    llvm/lib/IR/Instruction.cpp
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
    llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
    llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
    llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
    llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
    llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
    llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
    llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
    llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
    llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
    llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
    llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 2eb4fd36c5b7d..11385666e7ff8 100644

--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -762,6 +762,12 @@ class Instruction : public User,
   /// applied to any type.
   ///
   LLVM_ABI bool isCommutative() const LLVM_READONLY;
+
+  /// Checks if the operand is commutative. In commutative operations, not all
+  /// operands might commutable, e.g. for fmuladd only 2 first operands are
+  /// commutable.
+  LLVM_ABI bool isCommutableOperand(unsigned Op) const LLVM_READONLY;
+
   static bool isCommutative(unsigned Opcode) {
     switch (Opcode) {
     case Add: case FAdd:

diff  --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 0622bfae2c845..0b25baa465a71 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -101,6 +101,12 @@ class IntrinsicInst : public CallInst {
     }
   }
 
+  /// Return true if the operand is commutable.
+  bool isCommutableOperand(unsigned Op) const {
+    constexpr unsigned NumCommutativeOps = 2;
+    return isCommutative() && Op < NumCommutativeOps;
+  }
+
   /// Checks if the intrinsic is an annotation.
   bool isAssumeLikeIntrinsic() const {
     switch (getIntrinsicID()) {

diff  --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index f3d4d2424fe5b..7682c28e23b33 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1293,6 +1293,13 @@ bool Instruction::isCommutative() const {
   return isCommutative(getOpcode());
 }
 
+bool Instruction::isCommutableOperand(unsigned Op) const {
+  if (auto *II = dyn_cast<IntrinsicInst>(this))
+    return II->isCommutableOperand(Op);
+  // TODO: Should allow icmp/fcmp?
+  return isCommutative(getOpcode());
+}
+
 unsigned Instruction::getNumSuccessors() const {
   switch (getOpcode()) {
 #define HANDLE_TERM_INST(N, OPC, CLASS)                                        \

diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b8e09713c489f..38e1c3bb44106 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -573,6 +573,27 @@ static bool isCommutative(Instruction *I, Value *ValWithUses,
   return I->isCommutative();
 }
 
+/// Checks if the operand is commutative. In commutative operations, not all
+/// operands might commutable, e.g. for fmuladd only 2 first operands are
+/// commutable.
+static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
+                                bool IsCopyable = false) {
+  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
+         "The instruction is not commutative.");
+  if (isa<CmpInst>(I))
+    return true;
+  if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+    switch (BO->getOpcode()) {
+    case Instruction::Sub:
+    case Instruction::FSub:
+      return true;
+    default:
+      break;
+    }
+  }
+  return I->isCommutableOperand(Op);
+}
+
 /// This is a helper function to check whether \p I is commutative.
 /// This is a convenience wrapper that calls the two-parameter version of
 /// isCommutative with the same instruction for both parameters. This is
@@ -5326,13 +5347,14 @@ class slpvectorizer::BoUpSLP {
       if (ScheduleCopyableDataMap.empty())
         return false;
       SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
-      SmallDenseMap<const TreeEntry *, unsigned> OrderedEntriesCount;
       ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User);
       if (Entries.empty())
         return false;
+      unsigned CurNumOps = 0;
       for (const Use &U : User->operands()) {
         if (U.get() != Op)
           continue;
+        ++CurNumOps;
         // Check all tree entries, if they have operands replaced by copyable
         // data.
         for (TreeEntry *TE : Entries) {
@@ -5365,27 +5387,43 @@ class slpvectorizer::BoUpSLP {
           // Same applies even for non-commutative cmps, because we can invert
           // their predicate potentially and, thus, reorder the operands.
           bool IsCommutativeUser =
-              ::isCommutative(User) ||
-              ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User);
-          if (!IsCommutativeUser && !isa<CmpInst>(User)) {
-            unsigned &OpCnt =
-                OrderedEntriesCount.try_emplace(TE, 0).first->getSecond();
+              ::isCommutative(User) &&
+              ::isCommutableOperand(User, User, U.getOperandNo());
+          if (!IsCommutativeUser) {
+            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(User);
+            IsCommutativeUser =
+                ::isCommutative(MainOp, User) &&
+                ::isCommutableOperand(MainOp, User, U.getOperandNo());
+          }
+          // The commutative user with the same operands can be safely
+          // considered as non-commutative, operands reordering does not change
+          // the semantics.
+          assert(
+              !IsCommutativeUser ||
+              (((::isCommutative(User) &&
+                 ::isCommutableOperand(User, User, 0) &&
+                 ::isCommutableOperand(User, User, 1)) ||
+                (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
+                 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User), User,
+                                       0) &&
+                 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User), User,
+                                       1)))) &&
+                  "Expected commutative user with 2 first commutable operands");
+          bool IsCommutativeWithSameOps =
+              IsCommutativeUser && User->getOperand(0) == User->getOperand(1);
+          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
+              !isa<CmpInst>(User)) {
             EdgeInfo EI(TE, U.getOperandNo());
-            if (!getScheduleCopyableData(EI, Op))
+            if (CurNumOps != NumOps || getScheduleCopyableData(EI, Op))
               continue;
-            // Found copyable operand - continue.
-            OpCnt += Inc;
-            continue;
+            return false;
           }
           PotentiallyReorderedEntriesCount.try_emplace(TE, 0)
               .first->getSecond() += Inc;
         }
       }
       if (PotentiallyReorderedEntriesCount.empty())
-        return all_of(OrderedEntriesCount,
-                      [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                        return P.second == NumOps;
-                      });
+        return true;
       // Check the commutative/cmp entries.
       for (auto &P : PotentiallyReorderedEntriesCount) {
         SmallPtrSet<Value *, 4> ParentsUniqueUsers;
@@ -5431,10 +5469,6 @@ class slpvectorizer::BoUpSLP {
       return all_of(PotentiallyReorderedEntriesCount,
                     [&](const std::pair<const TreeEntry *, unsigned> &P) {
                       return P.second == NumOps - 1;
-                    }) &&
-             all_of(OrderedEntriesCount,
-                    [&](const std::pair<const TreeEntry *, unsigned> &P) {
-                      return P.second == NumOps;
                     });
     }
 
@@ -5656,6 +5690,7 @@ class slpvectorizer::BoUpSLP {
                 }
               };
 
+          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
           for (ScheduleBundle *Bundle : Bundles) {
             if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
               break;
@@ -5663,7 +5698,6 @@ class slpvectorizer::BoUpSLP {
             // Need to search for the lane since the tree entry can be
             // reordered.
             auto *It = find(Bundle->getTreeEntry()->Scalars, In);
-            SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
             bool IsNonSchedulableWithParentPhiNode =
                 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                 Bundle->getTreeEntry()->UserTreeIndex &&
@@ -10873,7 +10907,9 @@ class InstructionsCompatibilityAnalysis {
            Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
            Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
            Opcode == Instruction::And || Opcode == Instruction::Or ||
-           Opcode == Instruction::Xor;
+           Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
+           Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
+           Opcode == Instruction::FDiv;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -11214,6 +11250,10 @@ class InstructionsCompatibilityAnalysis {
       case Instruction::And:
       case Instruction::Or:
       case Instruction::Xor:
+      case Instruction::FAdd:
+      case Instruction::FMul:
+      case Instruction::FSub:
+      case Instruction::FDiv:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
index 0783a28f56d85..961662c664a31 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll
@@ -11,10 +11,10 @@ define void @p(double %0) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> <i32 1, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> <double 1.000000e+00, double 1.000000e+00, double poison, double poison>, <4 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double poison>, <4 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fptosi <4 x double> [[TMP12]] to <4 x i32>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
index 0cc4d3db5c537..1abc16da77c8e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
@@ -4,15 +4,16 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[BB:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP9:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[FMUL:%.*]] = fmul float 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[FMUL:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[SITOFP:%.*]] = sitofp i32 0 to float
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[SITOFP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float poison, float poison, float poison, float 0.000000e+00>, <4 x i32> <i32 0, i32 0, i32 poison, i32 7>
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[FMUL]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP0]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
index d13a8578d1e00..c1cc3f2dfc9e5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -7,36 +7,30 @@
 define void @main(i1 %arg) {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
 ; CHECK:       cond.true:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.end:
 ; CHECK-NEXT:    br label [[INVOKE_CONT:%.*]]
 ; CHECK:       invoke.cont:
-; CHECK-NEXT:    br i1 %arg, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
 ; CHECK:       arrayctor.cont:
 ; CHECK-NEXT:    [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER:%.*]]
 ; CHECK:       for.cond36.preheader:
-; CHECK-NEXT:    br i1 %arg, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
 ; CHECK:       cond.false51.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       cond.true48.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
 ; CHECK:       cond.false66.us:
-; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> <double poison, double 0xBFA5CC2D1960285F>, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.000000e-01>, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], splat (double 1.400000e+02)
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 5.000000e+01, double 5.200000e+01>
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr undef, align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 2.000000e-01, double 3.000000e-01>, [[TMP1]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
+; CHECK-NEXT:    store <2 x double> <double 0x404900049667B5F2, double 0x404E0515D587DA7B>, ptr undef, align 8
+; CHECK-NEXT:    store <2 x double> <double 2.000000e-07, double 0x3F91A436DC4B6CE6>, ptr [[AGG_TMP101211_SROA_0_0_IDX]], align 8
 ; CHECK-NEXT:    ret void
 ; CHECK:       cond.true63.us:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       for.body42.lr.ph.us:
-; CHECK-NEXT:    br i1 %arg, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK-NEXT:    br i1 [[ARG]], label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
 ; CHECK:       _Z5clampd.exit.1:
 ; CHECK-NEXT:    br label [[FOR_COND36_PREHEADER]]
 ;
@@ -96,7 +90,7 @@ _Z5clampd.exit.1:
 define void @test(i1 %arg) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 %arg, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
 ; CHECK:       if.then38:
 ; CHECK-NEXT:    [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0
 ; CHECK-NEXT:    store <2 x double> <double 0x3FFA356C1D8A7F76, double 0x3FFDC4F38B38BEF4>, ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index 6d713e83bbf4e..ca65ff88a4b81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -9,33 +9,38 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[DIV_2_I_I:%.*]] = fmul float [[TMP0]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP20]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 0x7FF8000000000000, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ 1.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[FA_SROA_9_0:%.*]] = phi float [ [[DIV_2_I_I]], %[[IF_THEN]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi <4 x float> [ [[TMP10]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> [[TMP22]], float [[FA_SROA_9_0]], i32 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP28]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
+; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x float> [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fmul <2 x float> [[TMP20]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x float> [[TMP21]], zeroinitializer
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x float> [[TMP29]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x float> <float 1.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP30]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP32:%.*]] = fmul <4 x float> <float -0.000000e+00, float -0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP31]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fadd <4 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP32]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fmul <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x float> [[TMP19]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x float> [[TMP22]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x float> <float 0.000000e+00, float 1.000000e+00, float poison, float poison>, <4 x float> [[TMP28]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <4 x float> [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    store <4 x float> [[TMP27]], ptr [[CALL25]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 6942df532ae29..91ec61b275205 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -25,8 +25,7 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
index a07e617384e09..fd7f0c61b6737 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-for-copyable-parent.ll
@@ -6,14 +6,17 @@ define i1 @test(double %circ_radius, ptr %x) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP0]], 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double poison, double poison>, double [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> <double -0.000000e+00, double 0.000000e+00, double 1.000000e+00, double -0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> <double poison, double poison, double 0.000000e+00, double poison>, <4 x i32> <i32 1, i32 2, i32 6, i32 0>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP15]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp olt <4 x double> [[TMP9]], splat (double 1.000000e+00)

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
index eb3b183fd49eb..a9baedef3e509 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-user-with-copyable-ops.ll
@@ -6,17 +6,18 @@ define i1 @test(double %circ_radius, ptr %x, double %0) {
 ; CHECK-SAME: double [[CIRC_RADIUS:%.*]], ptr [[X:%.*]], double [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT:    [[ADD20:%.*]] = fadd double [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD20]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], <double -0.000000e+00, double -0.000000e+00, double 0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], <double -0.000000e+00, double 0.000000e+00, double -0.000000e+00, double -0.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> <double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> poison, double [[CIRC_RADIUS]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double 0.000000e+00, double 1.000000e+00, double 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP9]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x double> [[TMP7]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP12]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = fcmp olt <4 x double> [[TMP13]], splat (double 1.000000e+00)

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
index 8c684325f8c68..b71dbc49e7478 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-commutative-op-in-commutative-inst.ll
@@ -8,13 +8,11 @@ define void @test(ptr %quat, float %call13) {
 ; CHECK-SAME: ptr [[QUAT:%.*]], float [[CALL13:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CALL121:%.*]] = load volatile float, ptr null, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fmuladd.f32(float [[CALL13]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.fmuladd.f32(float [[CALL121]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 0.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[CALL13]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[CALL121]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer)
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x float> [[TMP2]], <float 0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> zeroinitializer, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    store <2 x float> [[TMP7]], ptr [[QUAT]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
index f101991648276..6dc9806da0aa9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-last-instruction-in-split-node.ll
@@ -4,9 +4,7 @@
 define float @test() {
 ; CHECK-LABEL: define float @test() {
 ; CHECK-NEXT:  [[LABEL:.*]]:
-; CHECK-NEXT:    [[SUB_I102_I:%.*]] = fsub float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float poison, float poison>, float [[SUB_I102_I]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float 0.000000e+00, i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -14,26 +12,12 @@ define float @test() {
 ; CHECK-NEXT:    [[TMP6:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <8 x float> [[TMP7]], <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> <float poison, float 1.000000e+00>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <12 x float> [[TMP16]], <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x float> [[TMP18]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float poison>, <8 x float> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> [[TMP20]], [[TMP8]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd <12 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[TMP17]]
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <12 x float> [[TMP22]], <12 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <8 x float> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP21]], <8 x float> poison, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <20 x float> [[TMP23]], <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <20 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef>, <20 x float> [[TMP24]], <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
 ; CHECK-NEXT:    br label %[[REGION_30:.*]]
 ; CHECK:       [[REGION_30]]:
-; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP25]], %[[LABEL]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <20 x float> [ [[TMP10]], %[[LABEL]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <20 x float> [[TMP26]], i32 7
 ; CHECK-NEXT:    ret float [[TMP27]]
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
index 7b298723d93b5..c58c63e51737c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/user-with-multi-copyable-ops.ll
@@ -11,30 +11,23 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[ARRAYIDX_I1464:%.*]] = getelementptr i8, ptr [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[ARRAYIDX_I1464]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[THIS]], align 8
-; CHECK-NEXT:    [[DIV251:%.*]] = fmul double [[TMP1]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL257:%.*]] = fmul double [[TMP4]], 0.000000e+00
 ; CHECK-NEXT:    [[MUL305:%.*]] = fmul double [[TMP4]], 0.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
-; CHECK-NEXT:    [[NEG356:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG356]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[THIS]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = fneg double [[TMP3]]
 ; CHECK-NEXT:    [[NEG380:%.*]] = fmul double [[TMP1]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double [[MUL257]])
 ; CHECK-NEXT:    [[FNEG381:%.*]] = fneg double [[TMP9]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG380]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> poison, double [[FNEG381]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = fmul <2 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[NEG417:%.*]] = fneg double [[MUL257]]
-; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG417]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG418:%.*]] = fneg double [[TMP16]]
-; CHECK-NEXT:    [[MUL419:%.*]] = fmul double [[DIV251]], [[FNEG418]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[MUL257]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = fneg <2 x double> [[TMP11]]
 ; CHECK-NEXT:    [[NEG436:%.*]] = fmul double [[TMP1]], [[TMP5]]
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[FNEG437:%.*]] = fneg double [[TMP17]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> <double 1.000000e+00, double poison>, double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP14]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = fneg <2 x double> [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP4]]
 ; CHECK-NEXT:    [[NEG455:%.*]] = fmul double [[TMP1]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG455]], double 0.000000e+00, double [[MUL305]])
@@ -42,19 +35,18 @@ define void @test(ptr %this, ptr %0, double %1) {
 ; CHECK-NEXT:    [[FNEG474:%.*]] = fneg double [[TMP20]]
 ; CHECK-NEXT:    [[NEG492:%.*]] = fneg double [[MUL305]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG492]], double 0.000000e+00, double 0.000000e+00)
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> poison, double [[DIV251]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP22]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x double> poison, double [[FNEG437]], i32 0
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x double> [[TMP24]], double [[TMP19]], i32 1
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x double> [[TMP25]], double [[FNEG474]], i32 2
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x double> [[TMP26]], double [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP28:%.*]] = fmul <4 x double> [[TMP23]], [[TMP27]]
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> <double 1.000000e+00, double 0.000000e+00>, [[TMP13]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP23]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG436]], double 0.000000e+00, double 0.000000e+00)
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x double> poison, double [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x double> [[TMP29]], double [[FNEG381]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x double> [[TMP25]], double [[TMP10]], i32 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x double> [[TMP29]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP31]], double [[MUL419]], i32 3
-; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <4 x double> [[TMP28]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <8 x double> [[TMP32]], <8 x double> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> [[TMP30]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <8 x double> [[TMP28]], double [[TMP19]], i32 5
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <8 x double> [[TMP32]], double [[FNEG474]], i32 6
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x double> [[TMP33]], double [[TMP21]], i32 7
+; CHECK-NEXT:    [[TMP34:%.*]] = fmul <8 x double> [[TMP31]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = fptrunc <8 x double> [[TMP34]] to <8 x float>
 ; CHECK-NEXT:    store <8 x float> [[TMP35]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 2a0e7889f0f34..d10d26671e76b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
-; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
 
 define void @add0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @add0(
@@ -336,32 +336,12 @@ entry:
 }
 
 define void @add1f(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1f(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd fast <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1f(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -387,18 +367,9 @@ entry:
 define void @sub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -565,18 +536,9 @@ entry:
 define void @mulf(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -631,32 +593,12 @@ entry:
 }
 
 define void @add1fn(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @add1fn(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = fadd <3 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; NON-POW2-NEXT:    store <3 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @add1fn(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store float [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 1.000000e+00, float 2.000000e+00>
-; POW2-ONLY-NEXT:    store <2 x float> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; POW2-ONLY-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
-; POW2-ONLY-NEXT:    store float [[ADD9]], ptr [[INCDEC_PTR7]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds float, ptr %src, i64 1
@@ -682,18 +624,9 @@ entry:
 define void @sub0fn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[INCDEC_PTR4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[TMP0]], <float -1.000000e+00, float -0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -780,18 +713,9 @@ entry:
 define void @mulfn(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-LABEL: @mulfn(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
-; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00>
-; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], ptr [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], ptr [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[DST:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
index 125c2dce32663..b23da5fa263f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll
@@ -52,11 +52,12 @@ define <2 x float> @replace_through_casts_and_binop(i16 %inp) {
 ; CHECK-SAME: i16 [[INP:%.*]]) {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i16 [[INP]], -10
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[INP]], 5
-; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i16 [[MUL]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd float [[TMP1]], 2.000000e+00
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i16 [[ADD]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> poison, i16 [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> [[TMP1]], i16 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[R:%.*]] = fadd <2 x float> [[TMP5]], <float 2.000000e+00, float -0.000000e+00>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %add = add nsw i16 %inp, -10

diff  --git a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
index 793d089404d1e..c79969de6ac41 100644
--- a/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/crash_exceed_scheduling.ll
@@ -1,52 +1,98 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-schedule-budget=27 -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @exceed(double %0, double %1) {
-; CHECK-LABEL: @exceed(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
-; CHECK-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
-; CHECK-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
-; CHECK-NEXT:    switch i32 undef, label [[BB1:%.*]] [
-; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
-; CHECK-NEXT:    ]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[LABEL:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    br label [[LABEL]]
-; CHECK:       label:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
-; CHECK-NEXT:    ret void
+; X86-LABEL: @exceed(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; X86-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; X86-NEXT:    [[IX:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IX1:%.*]] = fmul double [[TMP7]], undef
+; X86-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; X86-NEXT:    [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; X86-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP9]]
+; X86-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2>
+; X86-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], undef
+; X86-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; X86-NEXT:      i32 0, label [[BB2:%.*]]
+; X86-NEXT:    ]
+; X86:       bb1:
+; X86-NEXT:    br label [[LABEL:%.*]]
+; X86:       bb2:
+; X86-NEXT:    br label [[LABEL]]
+; X86:       label:
+; X86-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP14]], [[BB2]] ]
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: @exceed(
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[IXX0:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX1:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX2:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX3:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX4:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX5:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX10:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX11:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX12:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX13:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX14:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX15:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX20:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX21:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[IXX22:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; AARCH64-NEXT:    [[IX2:%.*]] = fmul double [[TMP7]], [[TMP7]]
+; AARCH64-NEXT:    [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 2>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> <double poison, double 1.000000e+00>, <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fdiv fast <2 x double> [[TMP9]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; AARCH64-NEXT:    [[IX:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[IX1:%.*]] = fmul double [[TMP12]], undef
+; AARCH64-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul fast <2 x double> [[TMP13]], [[TMP8]]
+; AARCH64-NEXT:    [[IXX101:%.*]] = fsub double undef, undef
+; AARCH64-NEXT:    [[TMP15:%.*]] = fmul fast <2 x double> [[TMP11]], undef
+; AARCH64-NEXT:    switch i32 undef, label [[BB1:%.*]] [
+; AARCH64-NEXT:      i32 0, label [[BB2:%.*]]
+; AARCH64-NEXT:    ]
+; AARCH64:       bb1:
+; AARCH64-NEXT:    br label [[LABEL:%.*]]
+; AARCH64:       bb2:
+; AARCH64-NEXT:    br label [[LABEL]]
+; AARCH64:       label:
+; AARCH64-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP14]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %i10 = fdiv fast double %0, %1

diff  --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 32e59697486a7..439943102b58a 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -7,56 +7,52 @@ define i1 @test(float %0, double %1) {
 ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; X86-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; X86-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; X86-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; X86-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 1, i32 poison>
 ; X86-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]]
+; X86-NEXT:    [[TMP13:%.*]] = fmul <4 x double> [[TMP6]], [[TMP12]]
 ; X86-NEXT:    [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; X86-NEXT:    [[TMP15:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP14]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; X86-NEXT:    [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP16]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; X86-NEXT:    [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]]
-; X86-NEXT:    [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; X86-NEXT:    [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float>
-; X86-NEXT:    [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer
-; X86-NEXT:    [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer
-; X86-NEXT:    [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]]
-; X86-NEXT:    [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]])
-; X86-NEXT:    ret i1 [[TMP25]]
+; X86-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP15]], [[TMP10]]
+; X86-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; X86-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; X86-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; X86-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; X86-NEXT:    ret i1 [[TMP23]]
 ;
 ; AARCH64-LABEL: define i1 @test
 ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
 ; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
 ; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <6 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00>, double [[TMP1]], i32 4
-; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> <i32 6, i32 7, i32 8, i32 9, i32 4, i32 5>
-; AARCH64-NEXT:    [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]]
-; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> <i32 poison, i32 4, i32 11, i32 11>
-; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double 1.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], <double 0.000000e+00, double 1.000000e+00, double 0.000000e+00, double 0.000000e+00>
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <8 x double> <double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>, double [[TMP1]], i32 4
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> [[TMP8]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = fmul <8 x double> zeroinitializer, [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
 ; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
-; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP10]], [[TMP13]]
+; AARCH64-NEXT:    [[TMP14:%.*]] = fmul <4 x double> [[TMP6]], [[TMP13]]
 ; AARCH64-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP15]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP17:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <8 x double> [[TMP17]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 7>
-; AARCH64-NEXT:    [[TMP19:%.*]] = fsub <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x double> [[TMP16]], [[TMP18]]
-; AARCH64-NEXT:    [[TMP21:%.*]] = shufflevector <8 x double> [[TMP19]], <8 x double> [[TMP20]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; AARCH64-NEXT:    [[TMP22:%.*]] = fptrunc <8 x double> [[TMP21]] to <8 x float>
-; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], zeroinitializer
-; AARCH64-NEXT:    [[TMP24:%.*]] = fcmp oeq <8 x float> [[TMP23]], zeroinitializer
-; AARCH64-NEXT:    [[TMP25:%.*]] = freeze <8 x i1> [[TMP24]]
-; AARCH64-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP25]])
-; AARCH64-NEXT:    ret i1 [[TMP26]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <8 x double> [[TMP16]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fptrunc <8 x double> [[TMP19]] to <8 x float>
+; AARCH64-NEXT:    [[TMP21:%.*]] = fmul <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = fcmp oeq <8 x float> [[TMP21]], zeroinitializer
+; AARCH64-NEXT:    [[TMP23:%.*]] = freeze <8 x i1> [[TMP22]]
+; AARCH64-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP23]])
+; AARCH64-NEXT:    ret i1 [[TMP24]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double

diff  --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index eefc99feebb95..09e3ef41b3dbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -6,34 +6,34 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778)
 ; X86-LABEL: @test(
 ; X86-NEXT:  entry:
 ; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; X86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; X86-NEXT:    [[TMP11:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; X86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; X86-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AARCH86-LABEL: @test(
 ; AARCH86-NEXT:  entry:
 ; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; AARCH86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[I1771]], align 8
+; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 0>
+; AARCH86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> [[TMP4]], double [[I1754:%.*]], i32 0
 ; AARCH86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; AARCH86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; AARCH86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; AARCH86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; AARCH86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781:%.*]], i32 2
+; AARCH86-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer
+; AARCH86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP2]], [[TMP10]]
+; AARCH86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 1>
+; AARCH86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 1>
+; AARCH86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; AARCH86-NEXT:    [[TMP6:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
 ; AARCH86-NEXT:    [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
 ; AARCH86-NEXT:    ret <4 x double> [[I1994]]
 ;