[llvm] 005f0fa - [SLP] Improved/fixed FMAD support in reductions
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 13:10:09 PDT 2025
Author: Alexey Bataev
Date: 2025-09-02T13:09:57-07:00
New Revision: 005f0fa40ed3fe4657322f95916577c2f855719b
URL: https://github.com/llvm/llvm-project/commit/005f0fa40ed3fe4657322f95916577c2f855719b
DIFF: https://github.com/llvm/llvm-project/commit/005f0fa40ed3fe4657322f95916577c2f855719b.diff
LOG: [SLP] Improved/fixed FMAD support in reductions
In the initial patch for FMAD, potential FMAD nodes were completely
excluded from the reduction analysis to keep that patch small, but this
may cause regressions.
This patch adds better detection of scalar FMAD reduction operations and
tries to calculate the cost of FMAD reduction operations correctly
(also excluding the cost of the scalar fmuls), as well as the cost of
split reduction operations combined with regular FMADs.
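As a rough illustration of the new cost accounting, here is a minimal
plain-C++ sketch with made-up integer costs (the real code queries
TTI::getIntrinsicInstrCost and TTI::getArithmeticInstrCost): when the
fadd reduction operands convert to FMA, the reduction operation is
charged as a fused fmuladd and the vector fmul it absorbs is refunded;
otherwise the plain fadd cost is charged.

#include <iostream>
#include <optional>

struct VectorCosts {
  int FMul;    // standalone vector fmul
  int FAdd;    // the plain reduction operation (RdxOpcode)
  int FMulAdd; // fused fmuladd intrinsic
};

// Cost to charge for the reduction op when it is FMA-convertible;
// std::nullopt mirrors an invalid FMACost (fall back to the fadd cost).
std::optional<int> fmaReductionOpCost(const VectorCosts &C, bool Convertible) {
  if (!Convertible)
    return std::nullopt;
  return C.FMulAdd - C.FMul; // "Also, exclude vector fmul cost."
}

int main() {
  VectorCosts C{/*FMul=*/2, /*FAdd=*/2, /*FMulAdd=*/2}; // assumed costs
  int Charged = fmaReductionOpCost(C, /*Convertible=*/true).value_or(C.FAdd);
  std::cout << "charged reduction-op cost: " << Charged << "\n"; // prints 0
}

On FMA-capable targets where fmuladd is as cheap as a single fmul, the
refund lets the vectorized reduction compete fairly with scalar code
that would have fused the same pairs anyway.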
It also fixes the handling of reduced values with many uses.
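That fix corresponds to the new hasOneUse() guard in the diff: a reduced
value with more than one user cannot be attributed to a single fused
multiply-add, so FMA costing is abandoned for the whole candidate set
(see the new AArch64 test below, where %14 feeds two fadds). A rough
sketch of the guard, using an illustrative Value type rather than
LLVM's:

#include <vector>

struct Value {
  int NumUses;     // stand-in for RdxVal->hasOneUse()
  Value *SoleUser; // stand-in for RdxVal->user_back()
};

// Collect the single user of each reduced value, as the diff's Ops loop
// does; return an empty list (no FMA costing) if any value has more uses.
std::vector<Value *> collectFMAUsers(const std::vector<Value *> &ReducedVals) {
  std::vector<Value *> Ops;
  for (Value *RdxVal : ReducedVals) {
    if (RdxVal->NumUses != 1)
      return {}; // many uses: fall back to plain reduction costing
    Ops.push_back(RdxVal->SoleUser);
  }
  return Ops;
}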
Reviewers: RKSimon, gregbedwell, hiraditya
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/152787
Added:
llvm/test/Transforms/SLPVectorizer/AArch64/many-uses-fma-candidate.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 040e2dafb56a6..9c6099c2da307 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23842,7 +23842,8 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC,
+ DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -24241,7 +24242,7 @@ class HorizontalReduction {
// Estimate cost.
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+ getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n");
@@ -24546,7 +24547,9 @@ class HorizontalReduction {
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
bool IsCmpSelMinMax, FastMathFlags FMF,
- const BoUpSLP &R) {
+ const BoUpSLP &R, DominatorTree &DT,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
@@ -24571,6 +24574,22 @@ class HorizontalReduction {
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ if (RdxKind == RecurKind::FAdd) {
+ InstructionCost FMACost = canConvertToFMA(
+ RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+ if (FMACost.isValid()) {
+ LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+ if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+ // Also, exclude scalar fmul cost.
+ InstructionCost FMulCost =
+ TTI->getInstructionCost(I, CostKind);
+ LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ ScalarCost += FMACost;
+ continue;
+ }
+ }
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
@@ -24635,8 +24654,45 @@ class HorizontalReduction {
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
- VectorCost +=
- TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+ InstructionCost FMACost = InstructionCost::getInvalid();
+ if (RdxKind == RecurKind::FAdd) {
+ // Check if the reduction operands can be converted to FMA.
+ SmallVector<Value *> Ops;
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *RdxVal : ReducedVals) {
+ if (!RdxVal->hasOneUse()) {
+ Ops.clear();
+ break;
+ }
+ if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+ FMF &= FPCI->getFastMathFlags();
+ Ops.push_back(RdxVal->user_back());
+ }
+ if (!Ops.empty()) {
+ FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+ *TTI, TLI);
+ if (FMACost.isValid()) {
+ // Calculate actual FMAD cost.
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+ {RVecTy, RVecTy, RVecTy}, FMF);
+ FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+ // Also, exclude vector fmul cost.
+ InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+ Instruction::FMul, RVecTy, CostKind);
+ LLVM_DEBUG(dbgs()
+ << "Minus vector FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ }
+ }
+ if (FMACost.isValid())
+ VectorCost += FMACost;
+ else
+ VectorCost +=
+ TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
if (RType != RedTy) {
unsigned Opcode = Instruction::Trunc;
if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25304,7 +25360,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25449,7 +25505,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/many-uses-fma-candidate.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/many-uses-fma-candidate.ll
new file mode 100644
index 0000000000000..fae8110ccf04f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/many-uses-fma-candidate.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define double @test(ptr %0, ptr %1) {
+; CHECK-LABEL: define double @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 144
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 232
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = fmul reassoc nsz <4 x double> [[TMP4]], splat (double 1.000000e+00)
+; CHECK-NEXT: [[TMP10:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = fmul double [[TMP5]], 2.000000e+00
+; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd reassoc nsz double [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = fadd reassoc nsz double [[OP_RDX5]], [[TMP6]]
+; CHECK-NEXT: [[OP_RDX7:%.*]] = fadd reassoc nsz double [[OP_RDX6]], [[TMP7]]
+; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd reassoc nsz double [[OP_RDX7]], [[TMP10]]
+; CHECK-NEXT: ret double [[OP_RDX8]]
+;
+entry:
+ %2 = getelementptr i8, ptr %1, i64 144
+ %3 = getelementptr i8, ptr %1, i64 152
+ %4 = getelementptr i8, ptr %1, i64 160
+ %5 = getelementptr i8, ptr %1, i64 168
+ %6 = getelementptr i8, ptr %1, i64 232
+ %7 = load double, ptr %2, align 8
+ %8 = load double, ptr %3, align 8
+ %9 = fadd reassoc nsz double %8, %7
+ %10 = load double, ptr %4, align 8
+ %11 = fadd reassoc nsz double %10, %9
+ %12 = load double, ptr %5, align 8
+ %13 = fadd reassoc nsz double %12, %11
+ %14 = load double, ptr %0, align 8
+ %15 = fadd reassoc nsz double %14, %13
+ %16 = fadd reassoc nsz double %14, %15
+ %17 = load double, ptr %6, align 8
+ %18 = fadd reassoc nsz double %17, %16
+ %19 = load double, ptr %1, align 8
+ %20 = fadd reassoc nsz double %19, %18
+ %21 = load double, ptr %0, align 8
+ %22 = fadd reassoc nsz double %21, %20
+ ret double %22
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 9cc6b8739b20f..9d2e22bb454e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -709,34 +709,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
define double @dot_product_fp64(ptr %a, ptr %b) {
-; NON-POW2-LABEL: @dot_product_fp64(
-; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
-; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
-; NON-POW2-NEXT: ret double [[TMP4]]
-;
-; POW2-ONLY-LABEL: @dot_product_fp64(
-; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret double [[ADD_1]]
+; CHECK-LABEL: @dot_product_fp64(
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret double [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
%l.a.0 = load double, ptr %gep.a.0, align 4
@@ -793,21 +784,13 @@ entry:
}
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
-; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
-; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
-; NON-POW2-NEXT: ret float [[TMP5]]
-;
-; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret float [[ADD_1]]
+; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret float [[ADD_1]]
;
%mul.0 = fmul fast float %a, 10.0
%mul.1 = fmul fast float %b, 10.0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index 1922e935cee4b..f921278cdecf3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -10,19 +10,24 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
+; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
index 82fb5a46fee7c..8b65461028fb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
; This test checks for a case when a horizontal reduction of floating-point
; adds may look profitable, but is not because it eliminates generation of
@@ -26,13 +26,27 @@ define void @hr() {
; AVX: loop:
; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; AVX-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; AVX-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; AVX-NEXT: [[ADD3]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
+; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
+; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
; AVX: exit:
; AVX-NEXT: ret void
+;
+; AVX2-LABEL: @hr(
+; AVX2-NEXT: br label [[LOOP:%.*]]
+; AVX2: loop:
+; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
+; AVX2: exit:
+; AVX2-NEXT: ret void
;
br label %loop
@@ -70,12 +84,24 @@ define double @hr_or_mul() {
;
; AVX-LABEL: @hr_or_mul(
; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
-; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX-NEXT: [[TMP4:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX-NEXT: ret double [[ADD3]]
+; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
+; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD3]]
+; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
+; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
+; AVX-NEXT: [[ADD4:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
+; AVX-NEXT: ret double [[ADD4]]
+;
+; AVX2-LABEL: @hr_or_mul(
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
+; AVX2-NEXT: ret double [[OP_RDX]]
;
%cvt0 = uitofp i16 3 to double
%mul0 = fmul fast double 7.000000e+00, %cvt0