[llvm] [SystemZ] SLP reductions: cost functions of reductions and scalarization (PR #112491)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 16 00:30:58 PDT 2024
https://github.com/JonPsson1 created https://github.com/llvm/llvm-project/pull/112491
This patch has two separate commits that are mostly independent, even though the second one benefits from the first.
1. Improve the SLPVectorizer getGatherCost() to take free vector element loads into account, together with the related SystemZ TTI cost functions (a short sketch of the idea follows below).
2. Implement SystemZ getArithmeticReductionCost() and getMinMaxReductionCost(), so far only for FP reductions.
The preliminary benchmark results are not very significant, but even so this should in theory be a general improvement, resulting in slightly increased SLP vectorization. Integer reduction costs should also be addressed, and some more benchmarking should probably be done before committing.
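To illustrate the idea in (1): when SLP builds a vector from scalars, a scalar that is a single-use load does not need a separate insertion instruction on a target that reports supportsEfficientVectorElementLoadStore(), since the load can be done directly into the vector element (on SystemZ, a vector load element instruction such as VLEG/VLEF). A rough standalone sketch of that masking step, not the exact patch code:

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Which lanes of a gathered vector still need a real insertion once
  // single-use loads are treated as free vector element loads.
  static APInt demandedInsertsIgnoringFreeLoads(ArrayRef<Value *> VL,
                                                const APInt &ShuffledElements) {
    APInt FreeEltLoads = APInt::getZero(VL.size());
    for (unsigned I = 0, E = VL.size(); I < E; ++I)
      if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
        FreeEltLoads.setBit(I); // Can be folded into a vector element load.
    return ~ShuffledElements & ~FreeEltLoads; // Only these lanes are inserted.
  }

The helper name here is made up for the example; in the patch the same masking is done inline in getGatherCost().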
@dominik-steenken (not sure why, but I could not add you as a reviewer)
>From b75e1b5ccc83ecde2a0b5d91b6d3426ebe92a471 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:51:20 +0200
Subject: [PATCH 1/2] Improvements to vector elements insertion costs.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 67 ++++++++++----
.../SystemZ/SystemZTargetTransformInfo.h | 4 +
.../Transforms/Vectorize/SLPVectorizer.cpp | 14 ++-
.../SystemZ/vec-elt-insertion.ll | 88 +++++++++++++++++++
4 files changed, 154 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 7e5728c40950ad..b98db455c2dd42 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ InstructionCost Cost = 0;
+
+ if (Insert && Ty->isIntOrIntVectorTy(64)) {
+ // VLVGP will insert two GPRs with one instruction.
+ InstructionCost CurrVectorCost = 0;
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ if (DemandedElts[Idx])
+ ++CurrVectorCost;
+ if (Idx % 2 == 1) {
+ Cost += std::min(InstructionCost(1), CurrVectorCost);
+ CurrVectorCost = 0;
+ }
+ }
+ Insert = false;
+ }
+
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+ Extract, CostKind);
+ return Cost;
+}
+
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
- InstructionCost Cost = (VF * LIBCALL_COST) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) +
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- NeedsExtracts, CostKind);
- TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
- /*Extract*/ false, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ NeedsExtracts, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+ /*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
- getScalarizationOverhead(DstVecTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
+ return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
- // vlvgp will insert two grs into a vector register, so only count half the
- // number of instructions.
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
- return ((Index % 2 == 0) ? 1 : 0);
+ if (Opcode == Instruction::InsertElement) {
+ // Vector Element Load.
+ if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ return 0;
+
+ // vlvgp will insert two grs into a vector register, so count half the
+ // number of instructions as an estimate when we don't have the full
+ // picture (as in getScalarizationOverhead()).
+ if (Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+ }
if (Opcode == Instruction::ExtractElement) {
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 8cc71a6c528f82..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 595aed2cab182b..a4fedb34785e16 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3025,8 +3025,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+ /// this subtree gets vectorized, we may need to insert the values from the
+ /// roots. This method calculates the cost of inserting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
@@ -12897,7 +12897,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ // Add insertion costs for all elements, but not for loads that can be
+ // loaded directly into a vector element for free.
+ APInt FreeEltLoads = APInt::getZero(VL.size());
+ if (TTI->supportsEfficientVectorElementLoadStore())
+ for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+ FreeEltLoads.setBit(I);
+ APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+ Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
new file mode 100644
index 00000000000000..7e64b42c52aa94
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -0,0 +1,88 @@
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+;
+; Test functions that (at least currently) only get vectorized if the
+; insertion cost for an element load is counted as free.
+
+; This function needs the free element load to be recognized in SLP
+; getGatherCost().
+define void @fun0(ptr nocapture %0, double %1) {
+; CHECK-LABEL: define void @fun0(
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+
+ %3 = fmul double %1, 2.000000e+00
+ %4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
+ %5 = tail call double @llvm.fmuladd.f64(double %3, double %3, double %4)
+ %sqrt1 = tail call double @llvm.sqrt.f64(double %5)
+ %6 = load double, ptr %0, align 8
+ %7 = fmul double %6, 2.000000e+00
+ %8 = tail call double @llvm.fmuladd.f64(double %7, double %7, double 0.000000e+00)
+ %9 = tail call double @llvm.fmuladd.f64(double %7, double %7, double %8)
+ %sqrt = tail call double @llvm.sqrt.f64(double %9)
+ %10 = fadd double %sqrt1, %sqrt
+ store double %10, ptr %0, align 8
+ ret void
+}
+
+
+; This function needs the element-load to be recognized in SystemZ
+; getVectorInstrCost().
+define void @fun1(double %0) local_unnamed_addr {
+; CHECK-LABEL: define void @fun1(
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
+
+ br label %2
+
+2:
+ %3 = phi double [ poison, %1 ], [ poison, %2 ]
+ %4 = phi double [ undef, %1 ], [ poison, %2 ]
+ %5 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+ %6 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+ %7 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+ %8 = phi double [ 0.000000e+00, %1 ], [ %21, %2 ]
+ %9 = fsub double 0.000000e+00, %8
+ %10 = fsub double 0.000000e+00, %7
+ %11 = fmul double %9, 0.000000e+00
+ %12 = fmul double %10, 0.000000e+00
+ %13 = fsub double 0.000000e+00, %6
+ %14 = fsub double 0.000000e+00, %5
+ %15 = tail call double @llvm.fmuladd.f64(double %13, double %13, double %11)
+ %16 = tail call double @llvm.fmuladd.f64(double %14, double %14, double %12)
+ %17 = fsub double 0.000000e+00, %4
+ %18 = fsub double 0.000000e+00, %3
+ %19 = tail call double @llvm.fmuladd.f64(double %17, double %17, double %15)
+ %20 = tail call double @llvm.fmuladd.f64(double %18, double %18, double %16)
+ %21 = load double, ptr null, align 8
+ %22 = fcmp olt double %19, %0
+ %23 = fcmp olt double %20, 0.000000e+00
+ %24 = or i1 %23, %22
+ br label %2
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; This should *not* be vectorized as the insertion into the vector isn't free,
+; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
+define void @fun2(ptr %0, ptr %Dst) {
+; CHECK-LABEL: define void @fun2(
+; CHECK-NOT: store <2 x i64>
+ %3 = load i64, ptr %0, align 8
+ %4 = icmp eq i64 %3, 0
+ br i1 %4, label %5, label %6
+
+5:
+ ret void
+
+6:
+ %7 = getelementptr i8, ptr %Dst, i64 24
+ store i64 %3, ptr %7, align 8
+ %8 = getelementptr i8, ptr %Dst, i64 16
+ store i64 0, ptr %8, align 8
+ br label %5
+}
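As a concrete example of the fun2 case above (and of the new SystemZTTIImpl::getScalarizationOverhead()), here is a rough standalone sketch of the VLVGP pairing model for 64-bit elements, assuming the same logic as in the patch: each adjacent pair of lanes can be built from two GPRs with a single VLVGP, so a pair of demanded lanes costs at most one instruction.

  #include <algorithm>
  #include <vector>

  // Insertion cost for a build vector of i64 elements: VLVGP inserts two
  // GPRs at once, so each lane pair costs at most 1.
  static unsigned i64InsertCost(const std::vector<bool> &DemandedElts) {
    unsigned Cost = 0, CurrPairCost = 0;
    for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
      if (DemandedElts[Idx])
        ++CurrPairCost;
      if (Idx % 2 == 1) {                  // A GPR pair ends here.
        Cost += std::min(1u, CurrPairCost);
        CurrPairCost = 0;
      }
    }
    return Cost;
  }
  // i64InsertCost({true, true})  == 1: one VLVGP builds the whole <2 x i64>.
  // i64InsertCost({true, false}) == 1: a single demanded lane still costs 1.

The nonzero cost for the lane pair is what keeps fun2 from being vectorized.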
>From 532984d89fec050a4147e4e73b1ba421d5dd57fd Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:52:37 +0200
Subject: [PATCH 2/2] FP reduction cost functions (for SLP)
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 +++
.../SystemZ/SystemZTargetTransformInfo.h | 7 +
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ++++++
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 ++++++++
.../SystemZ/reductions-fmin-fmax.ll | 411 ++++++++++++++++++
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 ++++++++
6 files changed, 976 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b98db455c2dd42..9ab5a77280e6f9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1388,6 +1388,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
+// EXPERIMENTAL
+static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
+
+InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
+ unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
+ InstructionCost Cost = 0;
+ Cost += NumVec - 1; // Full vector operations.
+ Cost += NumEltsPerVecReg; // Last vector scalar operations.
+ return Cost;
+}
+
+InstructionCost
+SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
+ (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: better to not vectorize small vectors?:
+ // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ // if (NumElts <= REDLIM)
+ // return NumVectors * 8; // => MachineCombiner
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+}
+
+InstructionCost
+SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+}
+
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 80294ada23c3a9..b65e75ab98814c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
new file mode 100644
index 00000000000000..055c25298d847e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @fadd_reductions() {
+; Z15-LABEL: 'fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fadd_reductions() {
+; Z15-LABEL: 'fast_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmul_reductions() {
+; Z15-LABEL: 'fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fmul_reductions() {
+; Z15-LABEL: 'fast_fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmin_reductions() {
+; Z15-LABEL: 'fmin_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+define void @fmax_reductions() {
+; Z15-LABEL: 'fmax_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
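The costs checked above for the fast and fmin/fmax cases follow directly from getFPReductionCost(): all but one vector register are combined with full-width operations, and the last register is reduced with one scalar operation per element. A small standalone check of that arithmetic, assuming 128-bit vector registers (SystemZ::VectorBits):

  #include <cassert>

  // Mirror of getFPReductionCost(): (NumVectors - 1) full-width vector ops
  // plus one scalar op per element of the last vector register.
  static unsigned fpReductionCost(unsigned NumVectors, unsigned ScalarBits) {
    const unsigned VectorBits = 128;
    unsigned NumEltsPerVecReg = VectorBits / ScalarBits;
    return (NumVectors - 1) + NumEltsPerVecReg;
  }

  int main() {
    assert(fpReductionCost(1, 64) == 2); // <2 x double> -> cost 2
    assert(fpReductionCost(2, 64) == 3); // <4 x double> -> cost 3
    assert(fpReductionCost(1, 32) == 4); // <4 x float>  -> cost 4
    assert(fpReductionCost(2, 32) == 5); // <8 x float>  -> cost 5
    return 0;
  }

The strict (non-fast) fadd/fmul cases fall back to the base implementation, since requiresOrderedReduction() is true for them.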
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
new file mode 100644
index 00000000000000..fa0587f1da931b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+;
+; Test vectorization and reassociation of fadd operations. If the loads can
+; be vectorized, cases with fewer operands are also profitable to vectorize.
+
+define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_4_addends_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ ret double %add5
+}
+
+define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+ ret double %add13
+}
+
+define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
+ ret float %add29
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
new file mode 100644
index 00000000000000..5a466178ba786b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmin/fmax operations. Vectorization
+; is more profitable if the loads are also vectorizable.
+
+define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
new file mode 100644
index 00000000000000..e08b38c69a840d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmul operations. If the loads can
+; be vectorized, even cases with fewer operands become profitable to vectorize.
+
+define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_4_factors_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ ret double %mul5
+}
+
+define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+ ret double %mul13
+}
+
+define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
+; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
+ ret float %mul29
+}
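
For reference, the *_nonseq tests above model a reduction over every other array element, so no single wide vector load is possible and the scalar loads must be gathered into a vector before the reduction. A minimal C sketch of that pattern (illustrative only; it is not taken from the patch, the function names are made up, and it assumes -ffast-math so the reductions may be reassociated; the tests use the fully unrolled form):

  #include <math.h>

  /* Stride-2 fmax reduction, like the fmax *_nonseq tests above.
     Under -ffast-math, fmax() lowers to llvm.maxnum and the chain may be
     turned into an llvm.vector.reduce.fmax by the SLP vectorizer. */
  double fmax_stride2(const double *x) {
    double m = x[0];
    for (int i = 1; i < 16; ++i)
      m = fmax(x[2 * i], m);
    return m;
  }

  /* Stride-2 fmul reduction, like the fmul_*_factors_nonseq tests above. */
  float fmul_stride2(const float *x) {
    float p = x[0];
    for (int i = 1; i < 16; ++i)
      p *= x[2 * i];
    return p;
  }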