[llvm] cfe1909 - Revert "[SLP]Initial FMAD support (#149102)"

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 10 07:16:06 PDT 2025


Author: David Green
Date: 2025-08-10T15:16:01+01:00
New Revision: cfe190979e8272c3192a5ce576b12daa6a8bfd8b

URL: https://github.com/llvm/llvm-project/commit/cfe190979e8272c3192a5ce576b12daa6a8bfd8b
DIFF: https://github.com/llvm/llvm-project/commit/cfe190979e8272c3192a5ce576b12daa6a8bfd8b.diff

LOG: Revert "[SLP]Initial FMAD support (#149102)"

This reverts commit 0fffb9f9ed81f4c2084b8fe040c88b60bb6c372a due to major
performance regressions.
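
For context: the reverted patch taught the SLP vectorizer to recognize
contractable fmul + fadd/fsub pairs and to model them as @llvm.fmuladd
calls when that was cheaper. A minimal sketch of the scalar pattern
involved (hypothetical IR, not taken from the patch's tests):

    ; A multiply-add pair whose fast-math flags permit contraction.
    %mul = fmul contract float %a, %b
    %add = fadd contract float %mul, %c

    ; The fused shape the patch recognized and costed it against.
    %fma = call contract float @llvm.fmuladd.f32(float %a, float %b, float %c)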

Added: 
    

Modified: 
    llvm/docs/ReleaseNotes.md
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
    llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
    llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
    llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
    llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
    llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
    llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
    llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
    llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll

Removed: 
    


################################################################################
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 88b7e6d6585f7..81710585baa9e 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -77,7 +77,6 @@ Changes to Vectorizers
 
 * Added initial support for copyable elements in SLP, which models copyable
   elements as add <element>, 0, i.e. uses identity constants for missing lanes.
-* SLP vectorizer supports initial recognition of FMA/FMAD pattern
 
 Changes to the AArch64 Backend
 ------------------------------
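
The release-note entry kept above describes copyable-element modeling: a
lane with no real instruction is treated as "add <element>, 0", i.e. the
identity operation on that element. A minimal sketch of the idea
(hypothetical IR, unrelated to this revert's tests):

    ; Lane 0 has a real scalar add; lane 1 contributes only the value %c.
    %op0 = add i32 %a, %b   ; real scalar operation
    %op1 = add i32 %c, 0    ; copyable element modeled with the identity constant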

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ec06a217667c2..5d0e2f9518a51 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3883,7 +3883,6 @@ class BoUpSLP {
     enum CombinedOpcode {
       NotCombinedOp = -1,
       MinMax = Instruction::OtherOpsEnd + 1,
-      FMulAdd,
     };
     CombinedOpcode CombinedOp = NotCombinedOp;
 
@@ -4034,9 +4033,6 @@ class BoUpSLP {
     /// Returns true if any scalar in the list is a copyable element.
     bool hasCopyableElements() const { return !CopyableElements.empty(); }
 
-    /// Returns the state of the operations.
-    const InstructionsState &getOperations() const { return S; }
-
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -11991,82 +11987,6 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
   }
 }
 
-static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
-                                       const InstructionsState &S,
-                                       DominatorTree &DT, const DataLayout &DL,
-                                       TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo &TLI) {
-  assert(all_of(VL,
-                [](Value *V) {
-                  return V->getType()->getScalarType()->isFloatingPointTy();
-                }) &&
-         "Can only convert to FMA for floating point types");
-  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
-
-  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
-    FastMathFlags FMF;
-    FMF.set();
-    for (Value *V : VL) {
-      auto *I = dyn_cast<Instruction>(V);
-      if (!I)
-        continue;
-      // TODO: support for copyable elements.
-      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
-      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
-        continue;
-      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
-        FMF &= FPCI->getFastMathFlags();
-    }
-    return FMF.allowContract();
-  };
-  if (!CheckForContractable(VL))
-    return InstructionCost::getInvalid();
-  // fmul also should be contractable
-  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
-  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
-
-  InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
-  if (!OpS.valid())
-    return InstructionCost::getInvalid();
-  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
-    return InstructionCost::getInvalid();
-  if (!CheckForContractable(Operands.front()))
-    return InstructionCost::getInvalid();
-  // Compare the costs.
-  InstructionCost FMulPlusFAddCost = 0;
-  InstructionCost FMACost = 0;
-  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  FastMathFlags FMF;
-  FMF.set();
-  for (Value *V : VL) {
-    auto *I = dyn_cast<Instruction>(V);
-    if (!I)
-      continue;
-    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
-      FMF &= FPCI->getFastMathFlags();
-    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
-  }
-  unsigned NumOps = 0;
-  for (auto [V, Op] : zip(VL, Operands.front())) {
-    auto *I = dyn_cast<Instruction>(Op);
-    if (!I || !I->hasOneUse()) {
-      if (auto *OpI = dyn_cast<Instruction>(V))
-        FMACost += TTI.getInstructionCost(OpI, CostKind);
-      if (I)
-        FMACost += TTI.getInstructionCost(I, CostKind);
-      continue;
-    }
-    ++NumOps;
-    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
-      FMF &= FPCI->getFastMathFlags();
-    FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
-  }
-  Type *Ty = VL.front()->getType();
-  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
-  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
-  return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
-}
-
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
@@ -12435,25 +12355,6 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
-    case Instruction::FSub:
-    case Instruction::FAdd: {
-      // Check if possible to convert (a*b)+c to fma.
-      if (E.State != TreeEntry::Vectorize ||
-          !E.getOperations().isAddSubLikeOp())
-        break;
-      if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
-               .isValid())
-        break;
-      // This node is a fmuladd node.
-      E.CombinedOp = TreeEntry::FMulAdd;
-      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
-      if (FMulEntry->UserTreeIndex &&
-          FMulEntry->State == TreeEntry::Vectorize) {
-        // The FMul node is part of the combined fmuladd node.
-        FMulEntry->State = TreeEntry::CombinedVectorize;
-      }
-      break;
-    }
     default:
       break;
     }
@@ -13686,11 +13587,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     }
     return IntrinsicCost;
   };
-  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
-                                         Instruction *VI) {
-    InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
-    return Cost;
-  };
   switch (ShuffleOrOp) {
   case Instruction::PHI: {
     // Count reused scalars.
@@ -14031,30 +13927,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
-  case TreeEntry::FMulAdd: {
-    auto GetScalarCost = [&](unsigned Idx) {
-      if (isa<PoisonValue>(UniqueValues[Idx]))
-        return InstructionCost(TTI::TCC_Free);
-      return GetFMulAddCost(E->getOperations(),
-                            cast<Instruction>(UniqueValues[Idx]));
-    };
-    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
-      FastMathFlags FMF;
-      FMF.set();
-      for (Value *V : E->Scalars) {
-        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
-          FMF &= FPCI->getFastMathFlags();
-          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
-            FMF &= FPCIOp->getFastMathFlags();
-        }
-      }
-      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
-                                  {VecTy, VecTy, VecTy}, FMF);
-      InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
-      return VecCost + CommonCost;
-    };
-    return GetCostDiff(GetScalarCost, GetVectorCost);
-  }
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
@@ -14092,16 +13964,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
       TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
-      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
-          ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
-      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
-          I && (ShuffleOrOp == Instruction::FAdd ||
-                ShuffleOrOp == Instruction::FSub)) {
-        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
-        if (IntrinsicCost.isValid())
-          ScalarCost = IntrinsicCost;
-      }
-      return ScalarCost;
+      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
+                                         Op1Info, Op2Info, Operands);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22730,21 +22594,11 @@ class HorizontalReduction {
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                  ScalarEvolution &SE, const DataLayout &DL,
-                                 const TargetLibraryInfo &TLI,
-                                 DominatorTree &DT, TargetTransformInfo &TTI) {
+                                 const TargetLibraryInfo &TLI) {
     RdxKind = HorizontalReduction::getRdxKind(Root);
     if (!isVectorizable(RdxKind, Root))
       return false;
 
-    // FMA reduction root - skip.
-    auto CheckForFMA = [&](Instruction *I) {
-      return RdxKind == RecurKind::FAdd &&
-             canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
-                 .isValid();
-    };
-    if (CheckForFMA(Root))
-      return false;
-
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
     Type *Ty = Root->getType();
@@ -22782,7 +22636,7 @@ class HorizontalReduction {
         // Also, do not try to reduce const values, if the operation is not
         // foldable.
         if (!EdgeInst || Level > RecursionMaxDepth ||
-            getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
+            getRdxKind(EdgeInst) != RdxKind ||
             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
             !isVectorizable(RdxKind, EdgeInst) ||
@@ -24351,13 +24205,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
   Stack.emplace(SelectRoot(), 0);
   SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
-  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
+  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
     if (R.isAnalyzedReductionRoot(Inst))
       return nullptr;
     if (!isReductionCandidate(Inst))
       return nullptr;
     HorizontalReduction HorRdx;
-    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
+    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };
@@ -24423,12 +24277,6 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
 
   if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
     return false;
-  // Skip potential FMA candidates.
-  if ((I->getOpcode() == Instruction::FAdd ||
-       I->getOpcode() == Instruction::FSub) &&
-      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
-          .isValid())
-    return false;
 
   Value *P = I->getParent();
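
The removed canConvertToFMA helper backed every FMA hook deleted above: it
required contraction to be allowed by the fast-math flags on both the
fadd/fsub and its fmul operand, summed the cost of the separate scalar
operations (FMulPlusFAddCost), compared that against the cost of the
equivalent @llvm.fmuladd intrinsic (FMACost), and reported success only
when the fused form was strictly cheaper. The same check made
matchAssociativeReduction and tryToVectorize skip roots that looked like
FMA candidates. A hedged sketch of the two scalar shapes being compared
(hypothetical IR):

    ; Costed as separate operations (FMulPlusFAddCost).
    %m = fmul fast float %x, %y
    %s = fadd fast float %m, %acc

    ; Costed as the fused intrinsic (FMACost); preferred only when cheaper.
    %f = call fast float @llvm.fmuladd.f32(float %x, float %y, float %acc)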
 

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 9e086dcad686c..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -8,18 +8,15 @@ target triple = "aarch64--linux-gnu"
 define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
@@ -50,18 +47,15 @@ for.end27:
 define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
-; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT:    [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
-; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 2e684320ba10e..295a71899c338 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -12,8 +12,7 @@ define void @test() {
 ; CHECK:       [[BB63]]:
 ; CHECK-NEXT:    br label %[[BB64]]
 ; CHECK:       [[BB64]]:
-; CHECK-NEXT:    [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
-; CHECK-NEXT:    [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
 ; CHECK-NEXT:    [[I66:%.*]] = load float, ptr poison, align 16
 ; CHECK-NEXT:    [[I67:%.*]] = load float, ptr poison, align 4
 ; CHECK-NEXT:    [[I68:%.*]] = load float, ptr poison, align 8
@@ -25,125 +24,57 @@ define void @test() {
 ; CHECK-NEXT:    [[I74:%.*]] = load float, ptr poison, align 4
 ; CHECK-NEXT:    [[I75:%.*]] = load float, ptr poison, align 16
 ; CHECK-NEXT:    [[I76:%.*]] = load float, ptr poison, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
 ; CHECK-NEXT:    br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
 ; CHECK:       [[BB77]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[BB78:.*]]
 ; CHECK:       [[BB78]]:
-; CHECK-NEXT:    [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[I87:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT:    [[I88:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT:    [[I89:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT:    [[I90:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT:    [[I91:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT:    [[I92:%.*]] = fadd fast float [[I91]], [[I87]]
-; CHECK-NEXT:    [[I93:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT:    [[I94:%.*]] = fadd fast float [[I93]], [[I88]]
-; CHECK-NEXT:    [[I95:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT:    [[I96:%.*]] = fadd fast float [[I95]], [[I89]]
-; CHECK-NEXT:    [[I97:%.*]] = fmul fast float [[I86]], poison
-; CHECK-NEXT:    [[I98:%.*]] = fadd fast float [[I97]], [[I90]]
-; CHECK-NEXT:    [[I99:%.*]] = fadd fast float [[I92]], poison
-; CHECK-NEXT:    [[I100:%.*]] = fadd fast float [[I94]], poison
-; CHECK-NEXT:    [[I101:%.*]] = fadd fast float [[I96]], poison
-; CHECK-NEXT:    [[I102:%.*]] = fadd fast float [[I98]], poison
-; CHECK-NEXT:    [[I103]] = fadd fast float [[I99]], poison
-; CHECK-NEXT:    [[I104]] = fadd fast float [[I100]], poison
-; CHECK-NEXT:    [[I105]] = fadd fast float [[I101]], poison
-; CHECK-NEXT:    [[I106]] = fadd fast float [[I102]], poison
-; CHECK-NEXT:    [[I107:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT:    [[I108:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT:    [[I109:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT:    [[I110:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT:    [[I111:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT:    [[I112:%.*]] = fadd fast float [[I111]], [[I107]]
-; CHECK-NEXT:    [[I113:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT:    [[I114:%.*]] = fadd fast float [[I113]], [[I108]]
-; CHECK-NEXT:    [[I115:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT:    [[I116:%.*]] = fadd fast float [[I115]], [[I109]]
-; CHECK-NEXT:    [[I117:%.*]] = fmul fast float [[I86]], poison
-; CHECK-NEXT:    [[I118:%.*]] = fadd fast float [[I117]], [[I110]]
-; CHECK-NEXT:    [[I119:%.*]] = fadd fast float [[I112]], poison
-; CHECK-NEXT:    [[I120:%.*]] = fadd fast float [[I114]], poison
-; CHECK-NEXT:    [[I121:%.*]] = fadd fast float [[I116]], poison
-; CHECK-NEXT:    [[I122:%.*]] = fadd fast float [[I118]], poison
-; CHECK-NEXT:    [[I123]] = fadd fast float [[I119]], poison
-; CHECK-NEXT:    [[I124]] = fadd fast float [[I120]], poison
-; CHECK-NEXT:    [[I125]] = fadd fast float [[I121]], poison
-; CHECK-NEXT:    [[I126]] = fadd fast float [[I122]], poison
-; CHECK-NEXT:    [[I135:%.*]] = fmul fast float [[I85]], [[I65]]
-; CHECK-NEXT:    [[I128:%.*]] = fmul fast float [[I80]], [[I65]]
-; CHECK-NEXT:    [[I129:%.*]] = fmul fast float [[I81]], [[I65]]
-; CHECK-NEXT:    [[I130:%.*]] = fmul fast float [[I82]], [[I65]]
-; CHECK-NEXT:    [[I133:%.*]] = fmul fast float [[I84]], [[I77]]
-; CHECK-NEXT:    [[I134:%.*]] = fadd fast float [[I133]], [[I135]]
-; CHECK-NEXT:    [[I136:%.*]] = fmul fast float [[I127]], [[I77]]
-; CHECK-NEXT:    [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]]
-; CHECK-NEXT:    [[I138:%.*]] = fmul fast float [[I131]], [[I77]]
-; CHECK-NEXT:    [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]]
-; CHECK-NEXT:    [[I137:%.*]] = fmul fast float [[I86]], [[I77]]
-; CHECK-NEXT:    [[I139:%.*]] = fadd fast float [[I137]], [[I130]]
-; CHECK-NEXT:    [[I140:%.*]] = fadd fast float [[I134]], poison
-; CHECK-NEXT:    [[I141:%.*]] = fadd fast float [[TMP51]], poison
-; CHECK-NEXT:    [[I142:%.*]] = fadd fast float [[TMP52]], poison
-; CHECK-NEXT:    [[I143:%.*]] = fadd fast float [[I139]], poison
-; CHECK-NEXT:    [[I144:%.*]] = fadd fast float [[I140]], poison
-; CHECK-NEXT:    [[I145:%.*]] = fadd fast float [[I141]], poison
-; CHECK-NEXT:    [[I146:%.*]] = fadd fast float [[I142]], poison
-; CHECK-NEXT:    [[I152:%.*]] = fadd fast float [[I143]], poison
-; CHECK-NEXT:    [[I147:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT:    [[I148:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT:    [[I149:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT:    [[I150:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT:    [[I151:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT:    [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]]
-; CHECK-NEXT:    [[I153:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT:    [[TMP58:%.*]] = fadd fast float [[I153]], [[I148]]
-; CHECK-NEXT:    [[I155:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT:    [[TMP59:%.*]] = fadd fast float [[I155]], [[I149]]
-; CHECK-NEXT:    [[I157:%.*]] = fmul fast float [[I86]], poison
-; CHECK-NEXT:    [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]]
-; CHECK-NEXT:    [[I159:%.*]] = fadd fast float [[TMP57]], poison
-; CHECK-NEXT:    [[I160:%.*]] = fadd fast float [[TMP58]], poison
-; CHECK-NEXT:    [[I161:%.*]] = fadd fast float [[TMP59]], poison
-; CHECK-NEXT:    [[I162:%.*]] = fadd fast float [[TMP60]], poison
-; CHECK-NEXT:    [[I163:%.*]] = fadd fast float [[I159]], poison
-; CHECK-NEXT:    [[I164:%.*]] = fadd fast float [[I160]], poison
-; CHECK-NEXT:    [[I165:%.*]] = fadd fast float [[I161]], poison
-; CHECK-NEXT:    [[I166:%.*]] = fadd fast float [[I162]], poison
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
+; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison
+; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
+; CHECK-NEXT:    [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
 ; CHECK-NEXT:    br i1 poison, label %[[BB78]], label %[[BB167]]
 ; CHECK:       [[BB167]]:
-; CHECK-NEXT:    [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ]
-; CHECK-NEXT:    [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ]
-; CHECK-NEXT:    [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ]
-; CHECK-NEXT:    [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ]
-; CHECK-NEXT:    [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ]
-; CHECK-NEXT:    [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ]
-; CHECK-NEXT:    [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ]
-; CHECK-NEXT:    [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ]
-; CHECK-NEXT:    [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ]
-; CHECK-NEXT:    [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ]
-; CHECK-NEXT:    [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ]
-; CHECK-NEXT:    [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ]
-; CHECK-NEXT:    [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ]
-; CHECK-NEXT:    [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <16 x float> [[TMP32]], i32 14
 ; CHECK-NEXT:    store float [[TMP33]], ptr poison, align 1
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <16 x float> [[TMP32]], i32 13
 ; CHECK-NEXT:    store float [[TMP34]], ptr poison, align 1
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x float> [[TMP32]], i32 15
 ; CHECK-NEXT:    br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]]
 ; CHECK:       [[BB184]]:
 ; CHECK-NEXT:    br label %[[BB185:.*]]
 ; CHECK:       [[BB185]]:
 ; CHECK-NEXT:    br i1 poison, label %[[BB185]], label %[[BB186]]
 ; CHECK:       [[BB186]]:
-; CHECK-NEXT:    [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ]
+; CHECK-NEXT:    [[I187:%.*]] = phi nsz float [ [[TMP35]], %[[BB167]] ], [ poison, %[[BB185]] ]
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index 8093285ad8717..64bdcf28af550 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -8,56 +8,35 @@
 define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
 ; CHECK-LABEL: @zot(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
-; CHECK-NEXT:    [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00
-; CHECK-NEXT:    [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00
-; CHECK-NEXT:    [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT:    [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP9]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[TMP6]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
 ; CHECK:       bb18:
-; CHECK-NEXT:    [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ]
-; CHECK-NEXT:    [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ]
-; CHECK-NEXT:    [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ]
-; CHECK-NEXT:    [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x float> [ [[TMP7]], [[BB:%.*]] ]
+; CHECK-NEXT:    [[VAL16:%.*]] = extractelement <4 x float> [[TMP7]], i32 2
 ; CHECK-NEXT:    [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
+; CHECK-NEXT:    [[VAL17:%.*]] = extractelement <4 x float> [[TMP7]], i32 3
 ; CHECK-NEXT:    [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
 ; CHECK-NEXT:    br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
 ; CHECK:       bb25:
-; CHECK-NEXT:    [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ]
-; CHECK-NEXT:    [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <4 x float> [ [[TMP8]], [[BB18]] ]
 ; CHECK-NEXT:    br label [[BB30:%.*]]
 ; CHECK:       bb30:
 ; CHECK-NEXT:    [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
 ; CHECK-NEXT:    [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT:    [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT:    [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float
-; CHECK-NEXT:    [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1
-; CHECK-NEXT:    [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1
-; CHECK-NEXT:    [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float
-; CHECK-NEXT:    [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2
-; CHECK-NEXT:    [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1
-; CHECK-NEXT:    [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float
-; CHECK-NEXT:    [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3
-; CHECK-NEXT:    [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1
-; CHECK-NEXT:    [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float
-; CHECK-NEXT:    [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]]
-; CHECK-NEXT:    [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]]
-; CHECK-NEXT:    [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]]
-; CHECK-NEXT:    [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]]
-; CHECK-NEXT:    [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]]
-; CHECK-NEXT:    [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]]
-; CHECK-NEXT:    [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]]
-; CHECK-NEXT:    [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]]
-; CHECK-NEXT:    [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]]
-; CHECK-NEXT:    [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]]
-; CHECK-NEXT:    [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP11]]
+; CHECK-NEXT:    [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT:    [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
 ; CHECK-NEXT:    [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
 ; CHECK-NEXT:    call void @ham(float [[VAL55]], float [[VAL56]])

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 430a46beace9a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -600,25 +600,29 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
 }
 
 define float @dot_product_fp32(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
-; CHECK-NEXT:    [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT:    ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
   %l.a.0 = load float, ptr %gep.a.0, align 4
@@ -646,25 +650,29 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
 ; Same as above, except the reduction order has been perturbed.  This
 ; is checking for our ability to reorder.
 define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32_reorder(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
-; CHECK-NEXT:    [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32_reorder(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT:    ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
   %l.a.0 = load float, ptr %gep.a.0, align 4
@@ -691,25 +699,29 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 
 
 define double @dot_product_fp64(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp64(
-; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; CHECK-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; CHECK-NEXT:    [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; CHECK-NEXT:    [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; CHECK-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret double [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp64(
+; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; NON-POW2-NEXT:    ret double [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp64(
+; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT:    [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret double [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
   %l.a.0 = load double, ptr %gep.a.0, align 4
@@ -766,13 +778,21 @@ entry:
 }
 
 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT:    ret float [[ADD_1]]
+; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
+; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
+; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
+; NON-POW2-NEXT:    [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
+; NON-POW2-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
+; NON-POW2-NEXT:    ret float [[TMP5]]
+;
+; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT:    ret float [[ADD_1]]
 ;
   %mul.0 = fmul fast float %a, 10.0
   %mul.1 = fmul fast float %b, 10.0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index 0879ec239e287..4a8af6d03da06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -2,7 +2,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 ;
 ; dot4(ptr x, ptr y) - ((x[0]*y[0])+(x[1]*y[1])+(x[2]*y[2])+(x[3]*y[3]))
@@ -95,47 +95,12 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
 }
 
 define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; SSE2-LABEL: @dot4f64_fast(
-; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE2-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; SSE2-NEXT:    ret double [[TMP4]]
-;
-; SSE4-LABEL: @dot4f64_fast(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE4-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; SSE4-NEXT:    ret double [[TMP4]]
-;
-; AVX-LABEL: @dot4f64_fast(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; AVX-NEXT:    ret double [[TMP4]]
-;
-; AVX2-LABEL: @dot4f64_fast(
-; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2
-; AVX2-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2
-; AVX2-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; AVX2-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; AVX2-NEXT:    [[X1:%.*]] = load double, ptr [[PTRX1]], align 4
-; AVX2-NEXT:    [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4
-; AVX2-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; AVX2-NEXT:    [[MUL1:%.*]] = fmul double [[X1]], [[Y1]]
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]]
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; AVX2-NEXT:    [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]]
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; AVX2-NEXT:    [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]]
-; AVX2-NEXT:    ret double [[DOT0123]]
+; CHECK-LABEL: @dot4f64_fast(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; CHECK-NEXT:    ret double [[TMP4]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -162,47 +127,12 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3
 }
 
 define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; SSE2-LABEL: @dot4f32_fast(
-; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; SSE2-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; SSE2-NEXT:    ret float [[TMP4]]
-;
-; SSE4-LABEL: @dot4f32_fast(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; SSE4-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SSE4-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; SSE4-NEXT:    ret float [[TMP4]]
-;
-; AVX-LABEL: @dot4f32_fast(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; AVX-NEXT:    ret float [[TMP4]]
-;
-; AVX2-LABEL: @dot4f32_fast(
-; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2
-; AVX2-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2
-; AVX2-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; AVX2-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; AVX2-NEXT:    [[X1:%.*]] = load float, ptr [[PTRX1]], align 4
-; AVX2-NEXT:    [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4
-; AVX2-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; AVX2-NEXT:    [[MUL1:%.*]] = fmul float [[X1]], [[Y1]]
-; AVX2-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]]
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; AVX2-NEXT:    [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]]
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; AVX2-NEXT:    [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]]
-; AVX2-NEXT:    ret float [[DOT0123]]
+; CHECK-LABEL: @dot4f32_fast(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT:    ret float [[TMP4]]
 ;
   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
@@ -441,18 +371,6 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1
 ; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
 ; AVX-NEXT:    [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
 ; AVX-NEXT:    ret double [[DOT01]]
-;
-; AVX2-LABEL: @dot2f64_fast(
-; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT:    [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; AVX2-NEXT:    [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; AVX2-NEXT:    [[X1:%.*]] = load double, ptr [[PTRX1]], align 4
-; AVX2-NEXT:    [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4
-; AVX2-NEXT:    [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; AVX2-NEXT:    [[MUL1:%.*]] = fmul double [[X1]], [[Y1]]
-; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]]
-; AVX2-NEXT:    ret double [[DOT01]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -491,18 +409,6 @@ define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16
 ; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
 ; AVX-NEXT:    [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; AVX-NEXT:    ret float [[DOT01]]
-;
-; AVX2-LABEL: @dot2f32_fast(
-; AVX2-NEXT:    [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT:    [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT:    [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; AVX2-NEXT:    [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; AVX2-NEXT:    [[X1:%.*]] = load float, ptr [[PTRX1]], align 4
-; AVX2-NEXT:    [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4
-; AVX2-NEXT:    [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; AVX2-NEXT:    [[MUL1:%.*]] = fmul float [[X1]], [[Y1]]
-; AVX2-NEXT:    [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]]
-; AVX2-NEXT:    ret float [[DOT01]]
 ;
   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 0bbdeb55e1516..eaa77d74f8df1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -31,9 +31,12 @@ define float @baz() {
 ; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
 ; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast float [[CONV]], 2.000000e+00
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP4]], 2.000000e+00
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00)
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
 ; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
 ; THRESHOLD-NEXT:    store float [[OP_RDX]], ptr @res, align 4
 ; THRESHOLD-NEXT:    ret float [[OP_RDX]]
@@ -73,41 +76,14 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT:    [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
 ; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; CHECK-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT:    [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8
-; CHECK-NEXT:    [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4
-; CHECK-NEXT:    [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
 ; CHECK-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
 ; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -116,41 +92,14 @@ define float @bazz() {
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT:    [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
 ; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; THRESHOLD-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16
-; THRESHOLD-NEXT:    [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4
-; THRESHOLD-NEXT:    [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; THRESHOLD-NEXT:    [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8
-; THRESHOLD-NEXT:    [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8
-; THRESHOLD-NEXT:    [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]]
-; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]]
-; THRESHOLD-NEXT:    [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4
-; THRESHOLD-NEXT:    [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4
-; THRESHOLD-NEXT:    [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]]
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
 ; THRESHOLD-NEXT:    store float [[OP_RDX1]], ptr @res, align 4
 ; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -202,21 +151,10 @@ define float @bazzz() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; CHECK-NEXT:    store float [[TMP5]], ptr @res, align 4
 ; CHECK-NEXT:    ret float [[TMP5]]
@@ -225,21 +163,10 @@ define float @bazzz() {
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT:    [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; THRESHOLD-NEXT:    store float [[TMP5]], ptr @res, align 4
 ; THRESHOLD-NEXT:    ret float [[TMP5]]
@@ -272,21 +199,10 @@ define i32 @foo() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
 ; CHECK-NEXT:    store i32 [[CONV4]], ptr @n, align 4
@@ -296,21 +212,10 @@ define i32 @foo() {
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, ptr @n, align 4
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT:    [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT:    [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT:    [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT:    [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
 ; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
 ; THRESHOLD-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
 ; THRESHOLD-NEXT:    store i32 [[CONV4]], ptr @n, align 4
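
The checks above lean on the semantics of llvm.vector.reduce.fadd: with the
fast flag the intrinsic performs a reassociated (unordered) sum seeded by its
scalar start operand, which is what lets a serial chain of fadd fast
instructions collapse into a single call. A minimal sketch of the equivalence
(function names here are illustrative, not from the patch):

  ; Both functions compute the same value under fast-math.
  define float @serial_sum(<4 x float> %v, float %start) {
    %e0 = extractelement <4 x float> %v, i32 0
    %e1 = extractelement <4 x float> %v, i32 1
    %e2 = extractelement <4 x float> %v, i32 2
    %e3 = extractelement <4 x float> %v, i32 3
    %s0 = fadd fast float %start, %e0
    %s1 = fadd fast float %s0, %e1
    %s2 = fadd fast float %s1, %e2
    %s3 = fadd fast float %s2, %e3
    ret float %s3
  }

  define float @reduced_sum(<4 x float> %v, float %start) {
    %r = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %v)
    ret float %r
  }

  declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)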

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index 45279296d296a..1922e935cee4b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -10,65 +10,17 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
 define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT:    [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8
-; CHECK-NEXT:    [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8
-; CHECK-NEXT:    [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]]
-; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16
-; CHECK-NEXT:    [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8
-; CHECK-NEXT:    [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]]
-; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3
-; CHECK-NEXT:    [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8
-; CHECK-NEXT:    [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1
-; CHECK-NEXT:    [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8
-; CHECK-NEXT:    [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]]
-; CHECK-NEXT:    [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]]
-; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17
-; CHECK-NEXT:    [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8
-; CHECK-NEXT:    [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]]
-; CHECK-NEXT:    [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]]
-; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5
-; CHECK-NEXT:    [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8
-; CHECK-NEXT:    [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2
-; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18
-; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7
-; CHECK-NEXT:    [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8
-; CHECK-NEXT:    [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9
-; CHECK-NEXT:    [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8
-; CHECK-NEXT:    [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11
-; CHECK-NEXT:    [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13
-; CHECK-NEXT:    [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8
-; CHECK-NEXT:    [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6
-; CHECK-NEXT:    [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8
-; CHECK-NEXT:    [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]]
-; CHECK-NEXT:    [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22
-; CHECK-NEXT:    [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8
-; CHECK-NEXT:    [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]]
-; CHECK-NEXT:    [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15
-; CHECK-NEXT:    [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8
-; CHECK-NEXT:    [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7
-; CHECK-NEXT:    [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8
-; CHECK-NEXT:    [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]]
-; CHECK-NEXT:    [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23
-; CHECK-NEXT:    [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8
-; CHECK-NEXT:    [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast double [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
 ; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
 ; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
index 33c281d3f0166..f0272d591f0c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
@@ -6,25 +6,9 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
 define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @rdx_feeds_single_insert(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8
-; CHECK-NEXT:    [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01
-; CHECK-NEXT:    [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1
-; CHECK-NEXT:    [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8
-; CHECK-NEXT:    [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01
-; CHECK-NEXT:    [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]]
-; CHECK-NEXT:    [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], <double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01>
-; CHECK-NEXT:    [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]])
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], <double 1.600000e+01, double 1.700000e+01>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], <double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01>
+; CHECK-NEXT:    [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]])
 ; CHECK-NEXT:    [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
index 359c24b00e92e..8c9f8b5868d4f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -1,39 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
-; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s  --check-prefixes=AVX
+; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s  --check-prefixes=CHECK
 
 ; This test checks for a case when a horizontal reduction of floating-point
 ; adds may look profitable, but is not because it eliminates generation of
 ; floating-point FMAs that would be more profitable.
 
+; FIXME: We generate a horizontal reduction today.
+
 define void @hr() {
-; SSE4-LABEL: @hr(
-; SSE4-NEXT:    br label [[LOOP:%.*]]
-; SSE4:       loop:
-; SSE4-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
-; SSE4-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; SSE4-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; SSE4-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; SSE4-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; SSE4-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
-; SSE4-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
-; SSE4:       exit:
-; SSE4-NEXT:    ret void
-;
-; AVX-LABEL: @hr(
-; AVX-NEXT:    br label [[LOOP:%.*]]
-; AVX:       loop:
-; AVX-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
-; AVX-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
-; AVX-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
-; AVX-NEXT:    [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
-; AVX-NEXT:    [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
-; AVX-NEXT:    [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
-; AVX-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
-; AVX:       exit:
-; AVX-NEXT:    ret void
+; CHECK-LABEL: @hr(
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
 ;
   br label %loop
 
@@ -59,27 +47,18 @@ exit:
 ; may look profitable; but both are not because this eliminates generation
 ; of floating-point FMAs that would be more profitable.
 
+; FIXME: We generate a horizontal reduction today, and if that's disabled, we
+; still vectorize some of the multiplies.
+
 define double @hr_or_mul() {
-; SSE4-LABEL: @hr_or_mul(
-; SSE4-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
-; SSE4-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; SSE4-NEXT:    [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
-; SSE4-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; SSE4-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; SSE4-NEXT:    ret double [[OP_RDX]]
-;
-; AVX-LABEL: @hr_or_mul(
-; AVX-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT:    [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
-; AVX-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]]
-; AVX-NEXT:    [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
-; AVX-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
-; AVX-NEXT:    [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
-; AVX-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
-; AVX-NEXT:    [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
-; AVX-NEXT:    [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
-; AVX-NEXT:    ret double [[ADD3]]
+; CHECK-LABEL: @hr_or_mul(
+; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
+; CHECK-NEXT:    ret double [[OP_RDX]]
 ;
   %cvt0 = uitofp i16 3 to double
   %mul0 = fmul fast double 7.000000e+00, %cvt0
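
The trade-off described in the two comments above comes down to two IR shapes.
In the scalar form each fmul fast feeds exactly one fadd fast, so the backend
may contract each pair into an FMA; in the horizontal-reduction form the adds
are absorbed into llvm.vector.reduce.fadd and the multiplies no longer pair
with adds, so that contraction opportunity is lost. A minimal sketch of the
two shapes (illustrative names, not from the patch):

  ; FMA-friendly scalar chain: each multiply feeds one add, so under
  ; fast-math each pair can be contracted to fma(a, b, acc).
  define double @mul_add_chain(double %a0, double %b0, double %a1, double %b1, double %acc) {
    %m0 = fmul fast double %a0, %b0
    %s0 = fadd fast double %m0, %acc
    %m1 = fmul fast double %a1, %b1
    %s1 = fadd fast double %m1, %s0
    ret double %s1
  }

  ; Horizontal-reduction shape: the adds become a single reduction call, so
  ; the vector multiply no longer pairs with an add and no FMA is formed.
  define double @reduction_shape(<2 x double> %a, <2 x double> %b, double %acc) {
    %m = fmul fast <2 x double> %a, %b
    %r = call fast double @llvm.vector.reduce.fadd.v2f64(double %acc, <2 x double> %m)
    ret double %r
  }

  declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)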

diff  --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
index 5fe02cbc84872..a64075db37ba1 100644
--- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
@@ -1,57 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
 
 define void @test() {
-; X86-LABEL: @test(
-; X86-NEXT:  entry:
-; X86-NEXT:    br label [[BODY:%.*]]
-; X86:       body:
-; X86-NEXT:    [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
-; X86-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
-; X86-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0
-; X86-NEXT:    [[TMP2:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP1]]
-; X86-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP2]])
-; X86-NEXT:    [[CMP42_I:%.*]] = fcmp fast ole double [[TMP3]], 0.000000e+00
-; X86-NEXT:    br i1 false, label [[BODY]], label [[EXIT:%.*]]
-; X86:       exit:
-; X86-NEXT:    br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
-; X86:       if.then135.i:
-; X86-NEXT:    [[TMP4:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP4]], <2 x i32> <i32 2, i32 1>
-; X86-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
-; X86-NEXT:    [[TMP7:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP6]]
-; X86-NEXT:    [[TMP8:%.*]] = fmul fast <2 x double> [[TMP7]], zeroinitializer
-; X86-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP8]], zeroinitializer
-; X86-NEXT:    br label [[IF_END209_I]]
-; X86:       if.end209.i:
-; X86-NEXT:    [[TMP10:%.*]] = phi <2 x double> [ [[TMP9]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
-; X86-NEXT:    ret void
-;
-; AARCH64-LABEL: @test(
-; AARCH64-NEXT:  entry:
-; AARCH64-NEXT:    br label [[BODY:%.*]]
-; AARCH64:       body:
-; AARCH64-NEXT:    [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
-; AARCH64-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
-; AARCH64-NEXT:    [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
-; AARCH64-NEXT:    [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
-; AARCH64-NEXT:    [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
-; AARCH64-NEXT:    [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
-; AARCH64-NEXT:    br i1 false, label [[BODY]], label [[EXIT:%.*]]
-; AARCH64:       exit:
-; AARCH64-NEXT:    br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
-; AARCH64:       if.then135.i:
-; AARCH64-NEXT:    [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
-; AARCH64-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1>
-; AARCH64-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
-; AARCH64-NEXT:    [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]]
-; AARCH64-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer
-; AARCH64-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer
-; AARCH64-NEXT:    br label [[IF_END209_I]]
-; AARCH64:       if.end209.i:
-; AARCH64-NEXT:    [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
-; AARCH64-NEXT:    ret void
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP8]]
+; CHECK-NEXT:    [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]])
+; CHECK-NEXT:    [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
+; CHECK-NEXT:    br i1 false, label [[BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
+; CHECK:       if.then135.i:
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> <i1 poison, i1 false>, <2 x i1> [[TMP1]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    br label [[IF_END209_I]]
+; CHECK:       if.end209.i:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %body

diff  --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
index b5d74f0b91ab8..1e4b598d9fe92 100644
--- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll
@@ -1,45 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH86 %}
+; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s %}
 
 define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) {
-; X86-LABEL: @test(
-; X86-NEXT:  entry:
-; X86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; X86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; X86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; X86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; X86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
-; X86-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
-; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
-; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
-; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
-; X86-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
-; X86-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
-; X86-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
-; X86-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
-; X86-NEXT:    ret <4 x double> [[TMP7]]
-;
-; AARCH86-LABEL: @test(
-; AARCH86-NEXT:  entry:
-; AARCH86-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
-; AARCH86-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
-; AARCH86-NEXT:    [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]]
-; AARCH86-NEXT:    [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]]
-; AARCH86-NEXT:    [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00
-; AARCH86-NEXT:    [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]]
-; AARCH86-NEXT:    [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00
-; AARCH86-NEXT:    [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]]
-; AARCH86-NEXT:    [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00
-; AARCH86-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
-; AARCH86-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
-; AARCH86-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]]
-; AARCH86-NEXT:    [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]]
-; AARCH86-NEXT:    [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0
-; AARCH86-NEXT:    [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1
-; AARCH86-NEXT:    [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2
-; AARCH86-NEXT:    [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3
-; AARCH86-NEXT:    ret <4 x double> [[I1994]]
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54
+; CHECK-NEXT:    [[I1772:%.*]] = load double, ptr [[I1771]], align 8
+; CHECK-NEXT:    [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55
+; CHECK-NEXT:    [[I1796:%.*]] = load double, ptr [[I1795]], align 8
+; CHECK-NEXT:    [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double poison>, double [[I1797]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret <4 x double> [[TMP7]]
 ;
 entry:
  %i1771 = getelementptr inbounds double, ptr %p2, i64 54