[llvm] [SystemZ] SLP reductions: cost functions of reductions and scalarization (PR #112491)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 25 09:08:05 PDT 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/112491
From 594147fd8e3af327a426862c37a85eeb42bb1870 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:51:20 +0200
Subject: [PATCH 1/7] Improvements to vector elements insertion costs.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 67 ++++++++++++++-----
.../SystemZ/SystemZTargetTransformInfo.h | 4 ++
.../Transforms/Vectorize/SLPVectorizer.cpp | 14 +++-
.../SystemZ/vec-elt-insertion.ll | 66 ++++--------------
4 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 7e5728c40950ad..b98db455c2dd42 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ InstructionCost Cost = 0;
+
+ if (Insert && Ty->isIntOrIntVectorTy(64)) {
+ // VLVGP will insert two GPRs with one instruction.
+ InstructionCost CurrVectorCost = 0;
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ if (DemandedElts[Idx])
+ ++CurrVectorCost;
+ if (Idx % 2 == 1) {
+ Cost += std::min(InstructionCost(1), CurrVectorCost);
+ CurrVectorCost = 0;
+ }
+ }
+ Insert = false;
+ }
+
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+ Extract, CostKind);
+ return Cost;
+}
+
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
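The pairing loop in the new getScalarizationOverhead() above can be illustrated with a
minimal standalone sketch (plain C++, not the LLVM API): for i64 element inserts, each
even/odd lane pair of demanded elements is charged at most one instruction, modeling
VLVGP inserting two GPRs with a single instruction. The lane masks in the final comment
are illustrative only.

#include <algorithm>
#include <vector>

static unsigned pairedInsertCost(const std::vector<bool> &DemandedElts) {
  unsigned Cost = 0, CurrVectorCost = 0;
  for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
    if (DemandedElts[Idx])
      ++CurrVectorCost;
    if (Idx % 2 == 1) { // close the current even/odd lane pair
      Cost += std::min(1u, CurrVectorCost);
      CurrVectorCost = 0;
    }
  }
  return Cost; // {1,1,1,1} -> 2, {1,0,0,1} -> 2, {1,1,0,0} -> 1
}

The real hook then clears Insert and adds the base-class scalarization cost, so only the
extract side (if requested) is still costed generically.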
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
- InstructionCost Cost = (VF * LIBCALL_COST) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) +
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- NeedsExtracts, CostKind);
- TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
- /*Extract*/ false, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ NeedsExtracts, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+ /*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
- getScalarizationOverhead(DstVecTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
+ return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
- // vlvgp will insert two grs into a vector register, so only count half the
- // number of instructions.
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
- return ((Index % 2 == 0) ? 1 : 0);
+ if (Opcode == Instruction::InsertElement) {
+ // Vector Element Load.
+ if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ return 0;
+
+ // vlvgp will insert two grs into a vector register, so count half the
+ // number of instructions as an estimate when we don't have the full
+ // picture (as in getScalarizationOverhead()).
+ if (Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+ }
if (Opcode == Instruction::ExtractElement) {
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
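For clarity, the InsertElement cases that the updated getVectorInstrCost() distinguishes can
be summarized with this hedged sketch (plain C++, not the real TTI signature;
OperandIsSingleUseLoad stands in for Op1->hasOneUse() && isa<LoadInst>(Op1), and returning
nullopt stands in for falling through to the remaining handling):

#include <optional>

static std::optional<int> insertElementCostSketch(bool OperandIsSingleUseLoad,
                                                  bool IsI64Vector,
                                                  unsigned Index) {
  if (OperandIsSingleUseLoad)
    return 0; // the load folds into a vector element load, so the insert is free
  if (IsI64Vector)
    return (Index % 2 == 0) ? 1 : 0; // one VLVGP covers an even/odd GPR lane pair
  return std::nullopt; // otherwise defer to the generic cost handling
}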
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 8cc71a6c528f82..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..17c82936408961 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3033,8 +3033,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+ /// this subtree gets vectorized, we may need to insert the values from the
+ /// roots. This method calculates the cost of inserting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
@@ -13013,7 +13013,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ // Add insertion costs for all elements, but not for loads that can be
+ // loaded directly into a vector element for free.
+ APInt FreeEltLoads = APInt::getZero(VL.size());
+ if (TTI->supportsEfficientVectorElementLoadStore())
+ for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+ FreeEltLoads.setBit(I);
+ APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+ Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
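The mask construction in the SLPVectorizer change above amounts to the following standalone
sketch (plain C++, not the APInt-based code; IsSingleUseLoad[I] stands in for
VL[I]->hasOneUse() && isa<LoadInst>(VL[I])):

#include <vector>

static std::vector<bool>
demandedInsertMask(const std::vector<bool> &Shuffled,
                   const std::vector<bool> &IsSingleUseLoad,
                   bool TargetHasFreeEltLoads) {
  std::vector<bool> Demanded(Shuffled.size());
  for (size_t I = 0; I < Shuffled.size(); ++I) {
    bool FreeLoad = TargetHasFreeEltLoads && IsSingleUseLoad[I];
    Demanded[I] = !Shuffled[I] && !FreeLoad; // ~ShuffledElements & ~FreeEltLoads
  }
  return Demanded; // passed to getScalarizationOverhead(Insert=true)
}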
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index eb8dd72e0304d9..7e64b42c52aa94 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,8 +1,4 @@
-; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
-; RUN: -pass-remarks-output=%t | FileCheck %s
-; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-;
-; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
;
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
@@ -11,19 +7,8 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-;
-; REMARK-LABEL: Function: fun0
-; REMARK: Args:
-; REMARK-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -39,36 +24,18 @@ define void @fun0(ptr nocapture %0, double %1) {
ret void
}
+
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) {
+define void @fun1(double %0) local_unnamed_addr {
; CHECK-LABEL: define void @fun1(
-; CHECK: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK: fcmp olt double
-; CHECK-NEXT: fcmp olt double
-; CHECK-NEXT: or i1
-;
-; REMARK-LABEL: Function: fun1
-; REMARK: Args:
-; REMARK: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
br label %2
@@ -104,14 +71,7 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK: insertelement
-; CHECK: store <2 x i64>
-;
-; REMARK-LABEL: Function: fun2
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost '
-; REMARK-NEXT: - Cost: '-1'
-
+; CHECK-NOT: store <2 x i64>
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
From f2816a5870d7329ce3345b2597da8929915246f3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:52:37 +0200
Subject: [PATCH 2/7] FP reduction cost functions (for SLP)
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 +++
.../SystemZ/SystemZTargetTransformInfo.h | 7 +
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ++++++
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 ++++++++
.../SystemZ/reductions-fmin-fmax.ll | 411 ++++++++++++++++++
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 ++++++++
6 files changed, 976 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b98db455c2dd42..9ab5a77280e6f9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1388,6 +1388,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
+// EXPERIMENTAL
+static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
+
+InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
+ unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
+ InstructionCost Cost = 0;
+ Cost += NumVec - 1; // Full vector operations.
+ Cost += NumEltsPerVecReg; // Last vector scalar operations.
+ return Cost;
+}
+
+InstructionCost
+SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
+ (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: better to not vectorize small vectors?:
+ // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ // if (NumElts <= REDLIM)
+ // return NumVectors * 8; // => MachineCombiner
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+}
+
+InstructionCost
+SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+}
+
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
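As a sanity check of the formula in getFPReductionCost() above (and of the Z15 expectations
in the new cost-model test below), here is a small standalone program. It assumes 128-bit
vector registers (i.e. SystemZ::VectorBits == 128), and the same numbers apply to both the
fadd/fmul and the fmin/fmax paths since they share the helper:

#include <cassert>

static unsigned fpReductionCost(unsigned NumVec, unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = 128 / ScalarBits; // assumes 128-bit vector registers
  return (NumVec - 1)        // full-width vector operations
         + NumEltsPerVecReg; // scalar operations on the last vector
}

int main() {
  assert(fpReductionCost(1, 64) == 2); // v2f64 -> 2
  assert(fpReductionCost(2, 64) == 3); // v4f64 -> 3
  assert(fpReductionCost(1, 32) == 4); // v4f32 -> 4
  assert(fpReductionCost(2, 32) == 5); // v8f32 -> 5
  return 0;
}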
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 80294ada23c3a9..b65e75ab98814c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
new file mode 100644
index 00000000000000..055c25298d847e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @fadd_reductions() {
+; Z15-LABEL: 'fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fadd_reductions() {
+; Z15-LABEL: 'fast_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmul_reductions() {
+; Z15-LABEL: 'fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fmul_reductions() {
+; Z15-LABEL: 'fast_fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmin_reductions() {
+; Z15-LABEL: 'fmin_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+define void @fmax_reductions() {
+; Z15-LABEL: 'fmax_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
new file mode 100644
index 00000000000000..fa0587f1da931b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+;
+; Test vectorization and reassociation of fadd operations. If the loads can
+; be vectorized, even cases with fewer operands are profitable to vectorize.
+
+define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_4_addends_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ ret double %add5
+}
+
+define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+ ret double %add13
+}
+
+define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
+ ret float %add29
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
new file mode 100644
index 00000000000000..5a466178ba786b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmin/fmax operations. Vectorization
+; is more profitable if the loads are also vectorizable.
+
+define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
new file mode 100644
index 00000000000000..e08b38c69a840d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmul operations. If the loads can
+; be vectorized, cases with fewer operands are also profitable to vectorize.
+
+define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_4_factors_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ ret double %mul5
+}
+
+define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+ ret double %mul13
+}
+
+define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
+; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
+ ret float %mul29
+}
>From e24af9edce86ebe64447cc6520b55a863cf367cf Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:16:24 +0200
Subject: [PATCH 3/7] Revert "FP reduction cost functions (for SLP)"
Hold off on this for now and first evaluate the scalarization costs separately.
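For context on what this revert removes: the reverted getFPReductionCost heuristic
(visible in the hunk below) charges one unit for each additional vector register that
has to be combined, plus one scalar step per element of the final vector register. A
minimal, self-contained sketch under that reading (not the in-tree code; 128 stands in
for SystemZ::VectorBits):

  // Sketch of the reverted heuristic, assuming a 128-bit vector register width.
  unsigned fpReductionCostSketch(unsigned NumVec, unsigned ScalarBits) {
    unsigned EltsPerVecReg = 128 / ScalarBits; // 4 for f32, 2 for f64
    return (NumVec - 1)     // full-width ops combining the vector registers
           + EltsPerVecReg; // scalar ops across the last vector register
  }
  // E.g. a fast v8f32 reduction: (2 - 1) + 4 = 5; a fast v4f64 reduction:
  // (2 - 1) + 2 = 3, matching the Z15 checks in the deleted
  // vector-reductions-fp.ll further down.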
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 ---
.../SystemZ/SystemZTargetTransformInfo.h | 7 -
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ------
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 --------
.../SystemZ/reductions-fmin-fmax.ll | 411 ------------------
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 --------
6 files changed, 976 deletions(-)
delete mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 9ab5a77280e6f9..b98db455c2dd42 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1388,57 +1388,6 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
-// EXPERIMENTAL
-static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
-
-InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
- unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
- InstructionCost Cost = 0;
- Cost += NumVec - 1; // Full vector operations.
- Cost += NumEltsPerVecReg; // Last vector scalar operations.
- return Cost;
-}
-
-InstructionCost
-SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind) {
- if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
- (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: better to not vectorize small vectors?:
- // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- // if (NumElts <= REDLIM)
- // return NumVectors * 8; // => MachineCombiner
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
-}
-
-InstructionCost
-SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind) {
- if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
-}
-
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b65e75ab98814c..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,13 +129,6 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind);
- InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind);
-
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
deleted file mode 100644
index 055c25298d847e..00000000000000
--- a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
-; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
-
-define void @fadd_reductions() {
-; Z15-LABEL: 'fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fadd_reductions() {
-; Z15-LABEL: 'fast_fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmul_reductions() {
-; Z15-LABEL: 'fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fmul_reductions() {
-; Z15-LABEL: 'fast_fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmin_reductions() {
-; Z15-LABEL: 'fmin_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
- ret void
-}
-
-define void @fmax_reductions() {
-; Z15-LABEL: 'fmax_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
- ret void
-}
-
-declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
-
-declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
deleted file mode 100644
index fa0587f1da931b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-;
-; Test vectorization and reassociation of fadd operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_4_addends_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- ret double %add5
-}
-
-define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
- ret double %add13
-}
-
-define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %add = fadd reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %add3 = fadd reassoc nsz arcp contract afn float %add, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
- ret float %add29
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
deleted file mode 100644
index 5a466178ba786b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
+++ /dev/null
@@ -1,411 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmin/fmax operations. Vectorization
-; is more profitable if the loads are also vectorizable.
-
-define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-declare float @llvm.minnum.f32(float, float)
-declare double @llvm.minnum.f64(double, double)
-declare float @llvm.maxnum.f32(float, float)
-declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
deleted file mode 100644
index e08b38c69a840d..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmul operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_4_factors_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- ret double %mul5
-}
-
-define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
- ret double %mul13
-}
-
-define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
-; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %mul = fmul reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
- ret float %mul29
-}
>From 878c642823ace75b02ed8bf12708d398798912f8 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:19:29 +0200
Subject: [PATCH 4/7] Cosmetic update per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 17c82936408961..92b03d5c19a526 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13017,7 +13017,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// loaded directly into a vector element for free.
APInt FreeEltLoads = APInt::getZero(VL.size());
if (TTI->supportsEfficientVectorElementLoadStore())
- for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
FreeEltLoads.setBit(I);
APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
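[Editor's note] The hunk above only swaps a manual index loop for llvm::seq, but the surrounding bookkeeping is the useful context: lanes whose scalar is a single-use load are marked so that, per the comment above, they can be loaded directly into a vector element for free and are then masked out of DemandedElts. A minimal sketch of that idiom follows, assuming LLVM's ADT/IR headers; the helper name computeFreeEltLoads is illustrative and not part of the patch.

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/Sequence.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Mark the lanes of VL whose scalar is a single-use load. On targets where
  // TTI->supportsEfficientVectorElementLoadStore() holds, such lanes can be
  // loaded straight into a vector element, so no insertion cost is charged
  // for them (they are cleared from DemandedElts, as in the hunk above).
  static APInt computeFreeEltLoads(ArrayRef<Value *> VL) {
    APInt FreeEltLoads = APInt::getZero(VL.size());
    for (unsigned I : seq<unsigned>(VL.size()))
      if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
        FreeEltLoads.setBit(I);
    return FreeEltLoads;
  }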
>From c474e3e0cfb05da713c5ce0a7dadf8604aa09a96 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 22 Oct 2024 14:54:10 +0200
Subject: [PATCH 5/7] Minor updates per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +++----
.../Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll | 2 +-
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 92b03d5c19a526..62d61c5d12dd7b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3032,10 +3032,9 @@ class BoUpSLP {
SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
unsigned NumParts, bool ForOrder = false);
- /// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to insert the values from the
- /// roots. This method calculates the cost of inserting the values.
- /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ /// \returns the cost of gathering (inserting) the values in \p VL into a
+ /// vector. \param ForPoisonSrc true if initial vector is poison, false
+ /// otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 7e64b42c52aa94..0c51cb2996dd4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -27,7 +27,7 @@ define void @fun0(ptr nocapture %0, double %1) {
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) local_unnamed_addr {
+define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
; CHECK: fsub <2 x double>
; CHECK: fsub <2 x double>
>From 137be573766a5eb12bdaec7b6c25bc9dbf3f96f7 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 15:38:09 +0200
Subject: [PATCH 6/7] Some more minor updates.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 12 +++++-------
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
.../SLPVectorizer/SystemZ/vec-elt-insertion.ll | 17 ++++++++++++++++-
3 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b98db455c2dd42..0f40ed2be0003a 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,11 +469,9 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
-InstructionCost SystemZTTIImpl::
-getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+InstructionCost SystemZTTIImpl::getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
InstructionCost Cost = 0;
@@ -491,8 +489,8 @@ getScalarizationOverhead(VectorType *Ty,
Insert = false;
}
- Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
- Extract, CostKind);
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+ CostKind);
return Cost;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 62d61c5d12dd7b..4ddfbc377cf1b9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3033,8 +3033,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the cost of gathering (inserting) the values in \p VL into a
- /// vector. \param ForPoisonSrc true if initial vector is poison, false
- /// otherwise.
+ /// vector.
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 0c51cb2996dd4d..722fdc84463e55 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
+; RUN: -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
;
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
@@ -9,6 +11,11 @@ define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
; CHECK: fmul <2 x double>
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+;
+; REMARK-LABEL: Function: fun0
+; REMARK: Args:
+; REMARK-NEXT: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -36,6 +43,11 @@ define void @fun1(double %0) {
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: %14 = fcmp olt <2 x double> %13, %2
+;
+; REMARK-LABEL: Function: fun1
+; REMARK: Args:
+; REMARK: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
br label %2
@@ -72,6 +84,9 @@ declare double @llvm.fmuladd.f64(double, double, double)
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
; CHECK-NOT: store <2 x i64>
+;
+; REMARK-NOT: Function: fun2
+
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
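[Editor's note] To reproduce the new REMARK checks outside of lit, the updated RUN lines above translate to roughly: opt < llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer -pass-remarks-output=remarks.yaml (remarks.yaml standing in for lit's %t). The emitted YAML is what the REMARK prefix checks: the Cost: '-1' entries for fun0 and fun1 are consistent with the element-load insertions now being costed as free, while the REMARK-NOT line verifies that fun2 emits no SLP remark.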
>From 33c9f58f187816691087b845e4cf3584c1522f1e Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 18:04:10 +0200
Subject: [PATCH 7/7] Test updated on top of main.
---
.../SystemZ/vec-elt-insertion.ll | 29 ++++++++++++-------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 722fdc84463e55..906ad28c37db98 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -9,8 +9,10 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
;
; REMARK-LABEL: Function: fun0
; REMARK: Args:
@@ -36,13 +38,19 @@ define void @fun0(ptr nocapture %0, double %1) {
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: %14 = fcmp olt <2 x double> %13, %2
+; CHECK: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: fcmp olt <2 x double>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: or i1
;
; REMARK-LABEL: Function: fun1
; REMARK: Args:
@@ -83,7 +91,8 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK-NOT: store <2 x i64>
+; CHECK: store i64
+; CHECK: store i64
;
; REMARK-NOT: Function: fun2