[llvm] [SystemZ] SLP reductions: cost functions of reductions and scalarization (PR #112491)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 29 11:15:47 PST 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/112491
>From 931f64e7a83f31a1aec34f133675879265956787 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:51:20 +0200
Subject: [PATCH 01/11] Improvements to vector elements insertion costs.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 67 ++++++++++++++-----
.../SystemZ/SystemZTargetTransformInfo.h | 4 ++
.../Transforms/Vectorize/SLPVectorizer.cpp | 14 +++-
.../SystemZ/vec-elt-insertion.ll | 66 ++++--------------
4 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index a586eedd58b667..efea2713ea2fd7 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -468,6 +468,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+InstructionCost SystemZTTIImpl::
+getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ InstructionCost Cost = 0;
+
+ if (Insert && Ty->isIntOrIntVectorTy(64)) {
+ // VLVGP will insert two GPRs with one instruction.
+ InstructionCost CurrVectorCost = 0;
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ if (DemandedElts[Idx])
+ ++CurrVectorCost;
+ if (Idx % 2 == 1) {
+ Cost += std::min(InstructionCost(1), CurrVectorCost);
+ CurrVectorCost = 0;
+ }
+ }
+ Insert = false;
+ }
+
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+ Extract, CostKind);
+ return Cost;
+}
+
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
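To make the pair-counting above concrete, here is a minimal standalone C++ sketch of the VLVGP accounting (plain std::vector<bool> stands in for llvm::APInt; not the actual TTI interface): every aligned pair of i64 lanes costs at most one insert, since VLVGP moves two GPRs into a vector register at once.

#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified model of the loop in getScalarizationOverhead() above.
static unsigned i64InsertCost(const std::vector<bool> &DemandedElts) {
  unsigned Cost = 0, CurrVectorCost = 0;
  for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
    if (DemandedElts[Idx])
      ++CurrVectorCost;
    if (Idx % 2 == 1) {                     // an aligned lane pair ends here
      Cost += std::min(1u, CurrVectorCost); // one VLVGP covers the pair
      CurrVectorCost = 0;
    }
  }
  return Cost;
}

int main() {
  printf("%u\n", i64InsertCost({true, true}));             // 1, not 2
  printf("%u\n", i64InsertCost({true, false}));            // 1
  printf("%u\n", i64InsertCost({true, true, true, true})); // 2, not 4
}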
@@ -609,7 +636,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@@ -636,7 +663,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -654,8 +681,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
- InstructionCost Cost = (VF * LIBCALL_COST) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) +
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -975,10 +1003,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- NeedsExtracts, CostKind);
- TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
- /*Extract*/ false, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ NeedsExtracts, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+ /*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -990,8 +1018,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
- getScalarizationOverhead(DstVecTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -1004,8 +1032,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
+ return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -1114,10 +1142,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
- // vlvgp will insert two grs into a vector register, so only count half the
- // number of instructions.
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
- return ((Index % 2 == 0) ? 1 : 0);
+ if (Opcode == Instruction::InsertElement) {
+ // Vector Element Load.
+ if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ return 0;
+
+ // vlvgp will insert two GPRs into a vector register, so count half the
+ // number of instructions as an estimate when we don't have the full
+ // picture (as in getScalarizationOverhead()).
+ if (Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+ }
if (Opcode == Instruction::ExtractElement) {
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
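For illustration, a standalone sketch of the InsertElement cases above (hypothetical helper, simplified: the real code falls through to the base implementation for the remaining cases):

#include <cstdio>

// If the operand being inserted is a one-use load, it can become a vector
// element load (e.g. VLEG) and the insert itself is free. Otherwise, for
// i64 vectors, odd lanes fold into the VLVGP issued for the preceding lane.
static unsigned insertEltCost(bool OneUseLoadOp, bool IsI64Vec,
                              unsigned Index) {
  if (OneUseLoadOp)
    return 0;
  if (IsI64Vec)
    return (Index % 2 == 0) ? 1 : 0;
  return 1; // stand-in for the base-class cost
}

int main() {
  printf("%u\n", insertEltCost(true, true, 1));  // 0: element load
  printf("%u\n", insertEltCost(false, true, 0)); // 1: starts a VLVGP pair
  printf("%u\n", insertEltCost(false, true, 1)); // 0: folded into VLVGP
}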
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 8cc71a6c528f82..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7723442bc0fb6e..d405af5e43e507 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3111,8 +3111,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+ /// this subtree gets vectorized, we may need to insert the values from the
+ /// roots. This method calculates the cost of inserting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
@@ -13498,7 +13498,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ // Add insertion costs for all elements, but not for loads that can be
+ // loaded directly into a vector element for free.
+ APInt FreeEltLoads = APInt::getZero(VL.size());
+ if (TTI->supportsEfficientVectorElementLoadStore())
+ for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+ FreeEltLoads.setBit(I);
+ APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+ Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
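The masking can be seen with a small standalone bit-mask model (plain unsigned masks stand in for APInt; the element assignments are made up for illustration):

#include <cstdio>

int main() {
  unsigned NumElts = 4;
  unsigned ShuffledElements = 0b0001; // element 0 is provided by a shuffle
  unsigned FreeEltLoads = 0b0100;     // element 2 is a single-use load
  unsigned AllOnes = (1u << NumElts) - 1;
  // Only elements that are neither shuffled in nor free element loads
  // still need a costed insertion:
  unsigned DemandedElts = ~ShuffledElements & ~FreeEltLoads & AllOnes;
  printf("%#x\n", DemandedElts); // 0xa: elements 1 and 3 are costed
}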
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index eb8dd72e0304d9..7e64b42c52aa94 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,8 +1,4 @@
-; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
-; RUN: -pass-remarks-output=%t | FileCheck %s
-; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-;
-; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
;
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
@@ -11,19 +7,8 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-;
-; REMARK-LABEL: Function: fun0
-; REMARK: Args:
-; REMARK-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -39,36 +24,18 @@ define void @fun0(ptr nocapture %0, double %1) {
ret void
}
+
; This function needs the element load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) {
+define void @fun1(double %0) local_unnamed_addr {
; CHECK-LABEL: define void @fun1(
-; CHECK: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK: fcmp olt double
-; CHECK-NEXT: fcmp olt double
-; CHECK-NEXT: or i1
-;
-; REMARK-LABEL: Function: fun1
-; REMARK: Args:
-; REMARK: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
br label %2
@@ -104,14 +71,7 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK: insertelement
-; CHECK: store <2 x i64>
-;
-; REMARK-LABEL: Function: fun2
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost '
-; REMARK-NEXT: - Cost: '-1'
-
+; CHECK-NOT: store <2 x i64>
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
>From 4fc06d65fcf2cd07474d2e59583e7eb3ee008b1c Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:52:37 +0200
Subject: [PATCH 02/11] FP reduction cost functions (for SLP)
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 +++
.../SystemZ/SystemZTargetTransformInfo.h | 7 +
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ++++++
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 ++++++++
.../SystemZ/reductions-fmin-fmax.ll | 411 ++++++++++++++++++
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 ++++++++
6 files changed, 976 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index efea2713ea2fd7..965431268b7dc4 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1387,6 +1387,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
+// EXPERIMENTAL
+static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
+
+InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
+ unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
+ InstructionCost Cost = 0;
+ Cost += NumVec - 1; // Full vector operations.
+ Cost += NumEltsPerVecReg; // Last vector scalar operations.
+ return Cost;
+}
+
+InstructionCost
+SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
+ (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: better to not vectorize small vectors?:
+ // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ // if (NumElts <= REDLIM)
+ // return NumVectors * 8; // => MachineCombiner
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+}
+
+InstructionCost
+SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
+ unsigned NumVectors = getNumVectorRegs(Ty);
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // // EXPERIMENTAL: Return a low cost to enable heavily.
+ // return NumVectors / 2;
+
+ return getFPReductionCost(NumVectors, ScalarBits);
+ }
+
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+}
+
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
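The formula can be checked against the Z15 costs in the new cost-model test below. A standalone sketch, assuming 128-bit vector registers (SystemZ::VectorBits == 128): combining NumVec registers takes NumVec - 1 full-width vector ops, and the last register is then reduced with one scalar op per lane.

#include <cstdio>

static unsigned fpReductionCost(unsigned NumVec, unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = 128 / ScalarBits;
  return (NumVec - 1) + NumEltsPerVecReg;
}

int main() {
  printf("%u\n", fpReductionCost(1, 32));  // v4f32:  0 + 4 = 4
  printf("%u\n", fpReductionCost(2, 32));  // v8f32:  1 + 4 = 5
  printf("%u\n", fpReductionCost(1, 64));  // v2f64:  0 + 2 = 2
  printf("%u\n", fpReductionCost(2, 64));  // v4f64:  1 + 2 = 3
  printf("%u\n", fpReductionCost(4, 128)); // v4f128: 3 + 1 = 4
}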
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 80294ada23c3a9..b65e75ab98814c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
new file mode 100644
index 00000000000000..055c25298d847e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @fadd_reductions() {
+; Z15-LABEL: 'fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fadd_reductions() {
+; Z15-LABEL: 'fast_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmul_reductions() {
+; Z15-LABEL: 'fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fmul_reductions() {
+; Z15-LABEL: 'fast_fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmin_reductions() {
+; Z15-LABEL: 'fmin_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+define void @fmax_reductions() {
+; Z15-LABEL: 'fmax_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
new file mode 100644
index 00000000000000..fa0587f1da931b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+;
+; Test vectorization and reassociation of fadd operations. If the loads can
+; be vectorized, even cases with fewer operands are profitable to vectorize.
+
+define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_4_addends_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ ret double %add5
+}
+
+define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+ ret double %add13
+}
+
+define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
+ ret float %add29
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
new file mode 100644
index 00000000000000..5a466178ba786b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmin/fmax operations. Vectorization
+; is more profitable if the loads are also vectorizable.
+
+define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
new file mode 100644
index 00000000000000..e08b38c69a840d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmul operations. If the loads can
+; be vectorized, even cases with fewer operands are profitable to vectorize.
+
+define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_4_factors_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ ret double %mul5
+}
+
+define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+ ret double %mul13
+}
+
+define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
+; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
+ ret float %mul29
+}
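
(Note, not part of the patch: if cost-model changes shift which of these
reductions get vectorized, the CHECK lines above are meant to be regenerated
with the usual update_test_checks.py workflow rather than edited by hand,
e.g.

  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
    llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll

with build/bin/opt standing in for whatever opt binary was built locally.)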
>From 46e08ae0f88876d24ce9a1f703037bdc4acc3234 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:16:24 +0200
Subject: [PATCH 03/11] Revert "FP reduction cost functions (for SLP)"
Hold off on this and first evaluate the scalarization costs separately.
---
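(Reviewer note, not part of the patch: the reverted getFPReductionCost below
charges one full vector operation per extra vector register, plus one scalar
operation per lane of the last register. A standalone sketch of that
arithmetic, assuming the 128-bit SystemZ vector register width; the checked
values match the expectations in the deleted vector-reductions-fp.ll:

  #include <cassert>

  // Mirrors the reverted formula: full vector ops to combine the registers,
  // then per-lane scalar ops over the last register.
  static unsigned getFPReductionCostSketch(unsigned NumVec,
                                           unsigned ScalarBits) {
    const unsigned VectorBits = 128; // SystemZ::VectorBits
    unsigned NumEltsPerVecReg = VectorBits / ScalarBits;
    return (NumVec - 1) + NumEltsPerVecReg;
  }

  int main() {
    assert(getFPReductionCostSketch(1, 32) == 4); // fast fadd v4f32 -> 4
    assert(getFPReductionCostSketch(2, 32) == 5); // fast fadd v8f32 -> 5
    assert(getFPReductionCostSketch(1, 64) == 2); // fast fadd v2f64 -> 2
    assert(getFPReductionCostSketch(2, 64) == 3); // fast fadd v4f64 -> 3
    return 0;
  }
)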
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 ---
.../SystemZ/SystemZTargetTransformInfo.h | 7 -
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ------
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 --------
.../SystemZ/reductions-fmin-fmax.ll | 411 ------------------
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 --------
6 files changed, 976 deletions(-)
delete mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 965431268b7dc4..efea2713ea2fd7 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1387,57 +1387,6 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
-// EXPERIMENTAL
-static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
-
-InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
- unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
- InstructionCost Cost = 0;
- Cost += NumVec - 1; // Full vector operations.
- Cost += NumEltsPerVecReg; // Last vector scalar operations.
- return Cost;
-}
-
-InstructionCost
-SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind) {
- if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
- (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: better to not vectorize small vectors?:
- // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- // if (NumElts <= REDLIM)
- // return NumVectors * 8; // => MachineCombiner
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
-}
-
-InstructionCost
-SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind) {
- if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
-}
-
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b65e75ab98814c..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,13 +129,6 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind);
- InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind);
-
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
deleted file mode 100644
index 055c25298d847e..00000000000000
--- a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
-; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
-
-define void @fadd_reductions() {
-; Z15-LABEL: 'fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fadd_reductions() {
-; Z15-LABEL: 'fast_fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmul_reductions() {
-; Z15-LABEL: 'fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fmul_reductions() {
-; Z15-LABEL: 'fast_fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmin_reductions() {
-; Z15-LABEL: 'fmin_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
- ret void
-}
-
-define void @fmax_reductions() {
-; Z15-LABEL: 'fmax_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
- ret void
-}
-
-declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
-
-declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
deleted file mode 100644
index fa0587f1da931b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-;
-; Test vectorization and reassociation of fadd operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_4_addends_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- ret double %add5
-}
-
-define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
- ret double %add13
-}
-
-define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %add = fadd reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %add3 = fadd reassoc nsz arcp contract afn float %add, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
- ret float %add29
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
deleted file mode 100644
index 5a466178ba786b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
+++ /dev/null
@@ -1,411 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmin/fmax operations. Vectorization
-; is more profitable if the loads are also vectorizable.
-
-define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-declare float @llvm.minnum.f32(float, float)
-declare double @llvm.minnum.f64(double, double)
-declare float @llvm.maxnum.f32(float, float)
-declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
deleted file mode 100644
index e08b38c69a840d..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmul operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_4_factors_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- ret double %mul5
-}
-
-define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
- ret double %mul13
-}
-
-define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
-; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %mul = fmul reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
- ret float %mul29
-}
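For reference, the @llvm.vector.reduce.fmul calls in the deleted checks above fold a
start value over all vector lanes. A rough scalar sketch of the reassociated
semantics (plain C++; the helper name is illustrative, not an LLVM API):

  // Roughly what llvm.vector.reduce.fmul(double 1.0, <N x double> v)
  // computes when the fast-math flags permit reassociation.
  double reduceFMul(const double *V, unsigned N) {
    double Acc = 1.0; // the start value passed to the intrinsic
    for (unsigned I = 0; I < N; ++I)
      Acc *= V[I];
    return Acc;
  }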
>From d032d986f5d152c2d1108b0f29130a1319073dfb Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:19:29 +0200
Subject: [PATCH 04/11] Cosmetic update per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d405af5e43e507..757fad1578c3ca 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13502,7 +13502,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// loaded directly into a vector element for free.
APInt FreeEltLoads = APInt::getZero(VL.size());
if (TTI->supportsEfficientVectorElementLoadStore())
- for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
FreeEltLoads.setBit(I);
APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
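As a side note, the mask built above can be sketched standalone roughly as follows
(illustrative helper name; assumes only LLVM's ADT and IR headers):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/Sequence.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Mark each value that is a single-use load: such elements can be
  // loaded directly into a vector lane instead of being inserted.
  APInt getFreeEltLoads(ArrayRef<Value *> VL) {
    APInt FreeEltLoads = APInt::getZero(VL.size());
    for (unsigned I : seq<unsigned>(VL.size()))
      if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
        FreeEltLoads.setBit(I);
    return FreeEltLoads;
  }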
>From c894b79b7d41286e132f2ae6859b98d84c4877e1 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 22 Oct 2024 14:54:10 +0200
Subject: [PATCH 05/11] Minor updates per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +++----
.../Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll | 2 +-
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 757fad1578c3ca..482807d378d128 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3110,10 +3110,9 @@ class BoUpSLP {
SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
unsigned NumParts, bool ForOrder = false);
- /// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to insert the values from the
- /// roots. This method calculates the cost of inserting the values.
- /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ /// \returns the cost of gathering (inserting) the values in \p VL into a
+ /// vector. \param ForPoisonSrc true if initial vector is poison, false
+ /// otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 7e64b42c52aa94..0c51cb2996dd4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -27,7 +27,7 @@ define void @fun0(ptr nocapture %0, double %1) {
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) local_unnamed_addr {
+define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
; CHECK: fsub <2 x double>
; CHECK: fsub <2 x double>
>From 16e135476af5559f87b737244caac70903d852f3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 15:38:09 +0200
Subject: [PATCH 06/11] Some more minor updates.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 12 +++++-------
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
.../SLPVectorizer/SystemZ/vec-elt-insertion.ll | 17 ++++++++++++++++-
3 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index efea2713ea2fd7..e2f375f19cfeb7 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -468,11 +468,9 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
-InstructionCost SystemZTTIImpl::
-getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+InstructionCost SystemZTTIImpl::getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
InstructionCost Cost = 0;
@@ -490,8 +488,8 @@ getScalarizationOverhead(VectorType *Ty,
Insert = false;
}
- Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
- Extract, CostKind);
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+ CostKind);
return Cost;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 482807d378d128..c43b103141477b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3111,8 +3111,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the cost of gathering (inserting) the values in \p VL into a
- /// vector. \param ForPoisonSrc true if initial vector is poison, false
- /// otherwise.
+ /// vector.
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 0c51cb2996dd4d..722fdc84463e55 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
+; RUN: -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
;
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
@@ -9,6 +11,11 @@ define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
; CHECK: fmul <2 x double>
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+;
+; REMARK-LABEL: Function: fun0
+; REMARK: Args:
+; REMARK-NEXT: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -36,6 +43,11 @@ define void @fun1(double %0) {
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: %14 = fcmp olt <2 x double> %13, %2
+;
+; REMARK-LABEL: Function: fun1
+; REMARK: Args:
+; REMARK: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
br label %2
@@ -72,6 +84,9 @@ declare double @llvm.fmuladd.f64(double, double, double)
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
; CHECK-NOT: store <2 x i64>
+;
+; REMARK-NOT: Function: fun2
+
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
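To clarify the VLVGP costing touched above: SystemZTTIImpl::getScalarizationOverhead
charges at most one instruction per pair of demanded i64 lanes, since one VLVGP
inserts two GPRs. A minimal standalone sketch of that pairing (not the in-tree
code; plain unsigned stands in for InstructionCost):

  #include <algorithm>
  #include <vector>

  // Each even/odd lane pair closes at an odd index; if either lane in the
  // pair is demanded, the pair costs one VLVGP, never more.
  unsigned pairedInsertCost(const std::vector<bool> &DemandedElts) {
    unsigned Cost = 0, CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < DemandedElts.size(); ++Idx) {
      if (DemandedElts[Idx])
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(1u, CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    return Cost;
  }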
>From 6cd869ca5631c4b1f9e71400c7aff95e8fd7f8d4 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 18:04:10 +0200
Subject: [PATCH 07/11] Test updated on top of main.
---
.../SystemZ/vec-elt-insertion.ll | 29 ++++++++++++-------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 722fdc84463e55..906ad28c37db98 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -9,8 +9,10 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
;
; REMARK-LABEL: Function: fun0
; REMARK: Args:
@@ -36,13 +38,19 @@ define void @fun0(ptr nocapture %0, double %1) {
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: %14 = fcmp olt <2 x double> %13, %2
+; CHECK: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: fcmp olt <2 x double>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: or i1
;
; REMARK-LABEL: Function: fun1
; REMARK: Args:
@@ -83,7 +91,8 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK-NOT: store <2 x i64>
+; CHECK: store i64
+; CHECK: store i64
;
; REMARK-NOT: Function: fun2
>From a51f4527e988f2d722296caf261d4403bb5dca98 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 18:45:13 +0200
Subject: [PATCH 08/11] Auto-generate test instead
---
.../SystemZ/vec-elt-insertion.ll | 65 +++++++++++++------
1 file changed, 46 insertions(+), 19 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 906ad28c37db98..5d0eaf77a22af3 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
; RUN: -pass-remarks-output=%t | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=REMARK %s
@@ -9,10 +10,19 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul <2 x double>
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
+; CHECK-SAME: ptr nocapture [[TMP0:%.*]], double [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = fadd double [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store double [[TMP12]], ptr [[TMP0]], align 8
+; CHECK-NEXT: ret void
;
; REMARK-LABEL: Function: fun0
; REMARK: Args:
@@ -38,19 +48,26 @@ define void @fun0(ptr nocapture %0, double %1) {
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK: phi <2 x double>
-; CHECK-NEXT: phi <2 x double>
-; CHECK-NEXT: phi <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: fcmp olt <2 x double>
-; CHECK-NEXT: extractelement <2 x i1>
-; CHECK-NEXT: extractelement <2 x i1>
-; CHECK-NEXT: or i1
+; CHECK-SAME: double [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
+; CHECK-NEXT: br label %[[BB3:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ <double poison, double undef>, [[TMP1:%.*]] ], [ poison, %[[BB3]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ zeroinitializer, [[TMP1]] ], [ poison, %[[BB3]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ zeroinitializer, [[TMP1]] ], [ [[TMP18:%.*]], %[[BB3]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x double> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr null, align 8
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x double> [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP9]], <2 x double> [[TMP9]], <2 x double> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <2 x double> [[TMP13]], [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP14]], i32 1
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18]] = insertelement <2 x double> poison, double [[TMP10]], i32 1
+; CHECK-NEXT: br label %[[BB3]]
;
; REMARK-LABEL: Function: fun1
; REMARK: Args:
@@ -91,8 +108,18 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK: store i64
-; CHECK: store i64
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[BB5:.*]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: ret void
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 24
+; CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 16
+; CHECK-NEXT: store i64 0, ptr [[TMP7]], align 8
+; CHECK-NEXT: br label %[[BB4]]
;
; REMARK-NOT: Function: fun2
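The cost rule these tests exercise is that an insertelement whose operand is a
single-use load is free, because the load can be done directly into the vector
lane. A minimal sketch of that rule (illustrative name; unit cost assumed for a
real insert):

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // An insert fed by a single-use load folds into a vector element load;
  // anything else pays for a separate insert (assumed unit cost here).
  unsigned insertElementCost(const Value *Op) {
    if (Op && isa<LoadInst>(Op) && Op->hasOneUse())
      return 0;
    return 1;
  }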
>From 5f9b9ec149f81f8d4f5941744b709a27214e5585 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 8 Nov 2024 11:31:02 -0600
Subject: [PATCH 09/11] Pass VL to getScalarizationOverhead
---
.../llvm/Analysis/TargetTransformInfo.h | 28 ++++----
.../llvm/Analysis/TargetTransformInfoImpl.h | 7 +-
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 +++--
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 2 +-
.../AArch64/AArch64TargetTransformInfo.h | 3 +-
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +-
.../Target/RISCV/RISCVTargetTransformInfo.h | 3 +-
.../SystemZ/SystemZTargetTransformInfo.cpp | 21 ++++--
.../SystemZ/SystemZTargetTransformInfo.h | 3 +-
.../lib/Target/X86/X86TargetTransformInfo.cpp | 7 +-
llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 +-
.../Transforms/Vectorize/SLPVectorizer.cpp | 13 +---
.../SystemZ/vec-elt-insertion.ll | 67 ++++++++++++++++---
14 files changed, 120 insertions(+), 59 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..85310c86865164 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -909,11 +909,11 @@ class TargetTransformInfo {
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
- /// extracted from vectors.
- InstructionCost getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) const;
+ /// extracted from vectors. The involved values may be passed in VL if
+ /// Insert is true.
+ InstructionCost getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const;
/// Estimate the overhead of scalarizing an instructions unique
/// non-constant operands. The (potentially vector) types to use for each of
@@ -2001,10 +2001,10 @@ class TargetTransformInfo::Concept {
unsigned ScalarOpdIdx) = 0;
virtual bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
int ScalarOpdIdx) = 0;
- virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TargetCostKind CostKind) = 0;
+ virtual InstructionCost
+ getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract, TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt) = 0;
virtual InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
@@ -2582,12 +2582,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.isVectorIntrinsicWithOverloadTypeAtArg(ID, ScalarOpdIdx);
}
- InstructionCost getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TargetCostKind CostKind) override {
+ InstructionCost
+ getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract, TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
- CostKind);
+ CostKind, VL);
}
InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..9450d4bd323619 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -401,10 +401,9 @@ class TargetTransformInfoImplBase {
return ScalarOpdIdx == -1;
}
- InstructionCost getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) const {
+ InstructionCost getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const {
return 0;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 98cbb4886642bf..69e1ece31fb2d2 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,10 +777,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
- InstructionCost getScalarizationOverhead(VectorType *InTy,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+ InstructionCost getScalarizationOverhead(
+ VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) {
+ assert((VL.empty() ||
+ VL.size() == cast<FixedVectorType>(InTy)->getNumElements()) &&
+ "Type does not match the values.");
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))
@@ -795,9 +797,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
if (!DemandedElts[i])
continue;
- if (Insert)
+ if (Insert) {
+ Value *InsertedVal = VL.size() ? VL[i] : nullptr;
Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, i, nullptr, nullptr);
+ CostKind, i, nullptr, InsertedVal);
+ }
if (Extract)
Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, i, nullptr, nullptr);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..d4b6c08c5a32b2 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -622,9 +622,9 @@ bool TargetTransformInfo::isVectorIntrinsicWithOverloadTypeAtArg(
InstructionCost TargetTransformInfo::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) const {
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) const {
return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
- CostKind);
+ CostKind, VL);
}
InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d1536a276a9040..919226eb54fa59 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3363,7 +3363,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
InstructionCost AArch64TTIImpl::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
if (isa<ScalableVectorType>(Ty))
return InstructionCost::getInvalid();
if (Ty->getElementType()->isFloatingPointTy())
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 201bc831b816b3..f3f942035bf958 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -423,7 +423,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt);
/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bbded57bb92ab0..57f635ca6f42a8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -669,7 +669,7 @@ static unsigned isM1OrSmaller(MVT VT) {
InstructionCost RISCVTTIImpl::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
if (isa<ScalableVectorType>(Ty))
return InstructionCost::getInvalid();
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6fd36e90a02ddd..ff8c01faaa936b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -149,7 +149,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e2f375f19cfeb7..83b42f6d1794d5 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -468,17 +468,28 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+static bool isFreeEltLoad(Value *Op) {
+ if (isa<LoadInst>(Op) && Op->hasOneUse()) {
+ const Instruction *UserI = cast<Instruction>(*Op->user_begin());
+ return !isa<StoreInst>(UserI); // Prefer MVC
+ }
+ return false;
+}
+
InstructionCost SystemZTTIImpl::getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
InstructionCost Cost = 0;
if (Insert && Ty->isIntOrIntVectorTy(64)) {
- // VLVGP will insert two GPRs with one instruction.
+ // VLVGP will insert two GPRs with one instruction, while VLE will load
+ // an element directly with no extra cost.
+ assert((VL.empty() || VL.size() == NumElts) &&
+ "Type does not match the number of values.");
InstructionCost CurrVectorCost = 0;
for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
- if (DemandedElts[Idx])
+ if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
++CurrVectorCost;
if (Idx % 2 == 1) {
Cost += std::min(InstructionCost(1), CurrVectorCost);
@@ -489,7 +500,7 @@ InstructionCost SystemZTTIImpl::getScalarizationOverhead(
}
Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
- CostKind);
+ CostKind, VL);
return Cost;
}
@@ -1142,7 +1153,7 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
Value *Op1) {
if (Opcode == Instruction::InsertElement) {
// Vector Element Load.
- if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ if (Op1 != nullptr && isFreeEltLoad(Op1))
return 0;
// vlvgp will insert two grs into a vector register, so count half the
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 80294ada23c3a9..1e4950b365ba5d 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -84,7 +84,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 179e29e40614e7..abe70268108963 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4854,10 +4854,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
RegisterFileMoveCost;
}
-InstructionCost
-X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
assert(DemandedElts.getBitWidth() ==
cast<FixedVectorType>(Ty)->getNumElements() &&
"Vector size mismatch");
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 36d00cee0d18b5..44d501c3bf8777 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -169,7 +169,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
InstructionCost getScalarizationOverhead(VectorType *Ty,
const APInt &DemandedElts,
bool Insert, bool Extract,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = std::nullopt);
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
int VF,
const APInt &DemandedDstElts,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c43b103141477b..04755102643364 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13497,17 +13497,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- // Add insertion costs for all elements, but not for loads that can be
- // loaded directly into a vector element for free.
- APInt FreeEltLoads = APInt::getZero(VL.size());
- if (TTI->supportsEfficientVectorElementLoadStore())
- for (unsigned I : seq<unsigned>(VL.size()))
- if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
- FreeEltLoads.setBit(I);
- APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
- Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
+ Cost = TTI->getScalarizationOverhead(VecTy,
+ /*DemandedElts*/ ~ShuffledElements,
/*Insert*/ true,
- /*Extract*/ false, CostKind);
+ /*Extract*/ false, CostKind, VL);
}
}
if (DuplicateNonConst)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 5d0eaf77a22af3..85b8157c949f1f 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -6,15 +6,17 @@
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
+declare double @llvm.fmuladd.f64(double, double, double)
+
; This function needs the free element load to be recognized in SLP
; getGatherCost().
-define void @fun0(ptr nocapture %0, double %1) {
+define void @fun0(ptr %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK-SAME: ptr nocapture [[TMP0:%.*]], double [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[TMP0:%.*]], double [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP0]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], splat (double 2.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> zeroinitializer)
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
@@ -43,12 +45,11 @@ define void @fun0(ptr nocapture %0, double %1) {
ret void
}
-
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK-SAME: double [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: double [[TMP0:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
; CHECK-NEXT: br label %[[BB3:.*]]
; CHECK: [[BB3]]:
@@ -102,13 +103,11 @@ define void @fun1(double %0) {
br label %2
}
-declare double @llvm.fmuladd.f64(double, double, double)
-
; This should *not* be vectorized as the insertion into the vector isn't free,
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[DST:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP0]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[BB5:.*]]
@@ -137,3 +136,55 @@ define void @fun2(ptr %0, ptr %Dst) {
store i64 0, ptr %8, align 8
br label %5
}
+
+; This should *not* be vectorized as the load is immediately stored, in which
+; case MVC is preferred.
+define void @fun3(ptr %0) {
+; CHECK-LABEL: define void @fun3(
+; CHECK-SAME: ptr [[TMP0:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr inttoptr (i64 568 to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 48
+; CHECK-NEXT: br label %[[BB5:.*]]
+; CHECK: [[BB5]]:
+; CHECK-NEXT: store ptr null, ptr [[TMP3]], align 8, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr inttoptr (i64 64 to ptr), align 8, !tbaa [[TBAA8:![0-9]+]]
+; CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = tail call i64 [[TMP0]](ptr noundef poison, i64 noundef poison)
+; CHECK-NEXT: br label %[[BB5]]
+;
+ %2 = load ptr, ptr inttoptr (i64 568 to ptr), align 8
+ %3 = getelementptr inbounds nuw i8, ptr %2, i64 40
+ %4 = getelementptr inbounds nuw i8, ptr %2, i64 48
+ br label %5
+
+5:
+ store ptr null, ptr %3, align 8, !tbaa !1
+ %6 = load ptr, ptr inttoptr (i64 64 to ptr), align 8, !tbaa !9
+ store ptr %6, ptr %4, align 8
+ %7 = tail call i64 %0(ptr noundef poison, i64 noundef poison)
+ br label %5
+}
+
+!1 = !{!2, !7, i64 40}
+!2 = !{!"arc", !3, i64 0, !6, i64 8, !7, i64 16, !7, i64 24, !8, i64 32, !7, i64 40, !7, i64 48, !6, i64 56, !6, i64 64}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!"long", !4, i64 0}
+!7 = !{!"any pointer", !4, i64 0}
+!8 = !{!"short", !4, i64 0}
+!9 = !{!10, !7, i64 64}
+!10 = !{!"node", !6, i64 0, !3, i64 8, !7, i64 16, !7, i64 24, !7, i64 32, !7, i64 40, !7, i64 48, !7, i64 56, !7, i64 64, !7, i64 72, !6, i64 80, !6, i64 88, !3, i64 96, !3, i64 100}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META6:![0-9]+]], i64 40}
+; CHECK: [[META1]] = !{!"arc", [[META2:![0-9]+]], i64 0, [[META5:![0-9]+]], i64 8, [[META6]], i64 16, [[META6]], i64 24, [[META7:![0-9]+]], i64 32, [[META6]], i64 40, [[META6]], i64 48, [[META5]], i64 56, [[META5]], i64 64}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[META5]] = !{!"long", [[META3]], i64 0}
+; CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0}
+; CHECK: [[META7]] = !{!"short", [[META3]], i64 0}
+; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META6]], i64 64}
+; CHECK: [[META9]] = !{!"node", [[META5]], i64 0, [[META2]], i64 8, [[META6]], i64 16, [[META6]], i64 24, [[META6]], i64 32, [[META6]], i64 40, [[META6]], i64 48, [[META6]], i64 56, [[META6]], i64 64, [[META6]], i64 72, [[META5]], i64 80, [[META5]], i64 88, [[META2]], i64 96, [[META2]], i64 100}
+;.
>From d8269a345a5c9a63b4fe5ef690b860e9377174a5 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 29 Nov 2024 12:30:17 -0600
Subject: [PATCH 10/11] Review nits
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 6 +++---
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +-
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 ++----
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 +-
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 2 +-
llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h | 2 +-
llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 +-
7 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 85310c86865164..002163c83fc86a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -913,7 +913,7 @@ class TargetTransformInfo {
/// Insert is true.
InstructionCost getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const;
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) const;
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each of
@@ -2004,7 +2004,7 @@ class TargetTransformInfo::Concept {
virtual InstructionCost
getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
bool Insert, bool Extract, TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt) = 0;
+ ArrayRef<Value *> VL = {}) = 0;
virtual InstructionCost
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
ArrayRef<Type *> Tys,
@@ -2585,7 +2585,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
InstructionCost
getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
bool Insert, bool Extract, TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt) override {
+ ArrayRef<Value *> VL = {}) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind, VL);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9450d4bd323619..9ff45204a0718d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -403,7 +403,7 @@ class TargetTransformInfoImplBase {
InstructionCost getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) const {
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) const {
return 0;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 69e1ece31fb2d2..e0a404d0492608 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -779,10 +779,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// extracted from vectors.
InstructionCost getScalarizationOverhead(
VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = std::nullopt) {
- assert((VL.empty() ||
- VL.size() == cast<FixedVectorType>(InTy)->getNumElements()) &&
- "Type does not match the values.");
+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) {
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))
@@ -790,6 +787,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
auto *Ty = cast<FixedVectorType>(InTy);
assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
+ (VL.empty() || VL.size() == Ty->getNumElements()) &&
"Vector size mismatch");
InstructionCost Cost = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index f3f942035bf958..83b86e31565e49 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -424,7 +424,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt);
+ ArrayRef<Value *> VL = {});
/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ff8c01faaa936b..bd90bfed6e2c95 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -150,7 +150,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt);
+ ArrayRef<Value *> VL = {});
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 1e4950b365ba5d..6795da59bf5b16 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -85,7 +85,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt);
+ ArrayRef<Value *> VL = {});
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 44d501c3bf8777..7786616f89aa6e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -170,7 +170,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const APInt &DemandedElts,
bool Insert, bool Extract,
TTI::TargetCostKind CostKind,
- ArrayRef<Value *> VL = std::nullopt);
+ ArrayRef<Value *> VL = {});
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
int VF,
const APInt &DemandedDstElts,
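
The review-nits commit above also swaps the std::nullopt default for {} in every signature, matching the LLVM-wide move away from ArrayRef's since-deprecated std::nullopt_t constructor; both spellings produce an empty ArrayRef. A trivial illustration, assuming nothing beyond ArrayRef itself:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// `= {}` default-constructs an empty ArrayRef, equivalent to the old
// `= std::nullopt` spelling.
static bool calledWithNoValues(ArrayRef<Value *> VL = {}) {
  return VL.empty(); // true when invoked with the default argument
}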
>From cff0b6a3199113c2cc34340fb334b53bf5a6d2e8 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 29 Nov 2024 13:15:22 -0600
Subject: [PATCH 11/11] clang-format
---
.../include/llvm/Analysis/TargetTransformInfo.h | 17 ++++++++++-------
.../llvm/Analysis/TargetTransformInfoImpl.h | 8 +++++---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 8 +++++---
3 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 002163c83fc86a..89231e23e388a7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -911,9 +911,11 @@ class TargetTransformInfo {
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors. The involved values may be passed in VL if
/// Insert is true.
- InstructionCost getScalarizationOverhead(
- VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) const;
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {}) const;
/// Estimate the overhead of scalarizing an instruction's unique
/// non-constant operands. The (potentially vector) types to use for each of
@@ -2582,10 +2584,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.isVectorIntrinsicWithOverloadTypeAtArg(ID, ScalarOpdIdx);
}
- InstructionCost
- getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
- bool Insert, bool Extract, TargetCostKind CostKind,
- ArrayRef<Value *> VL = {}) override {
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {}) override {
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
CostKind, VL);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 9ff45204a0718d..48ebffff8cbfc2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -401,9 +401,11 @@ class TargetTransformInfoImplBase {
return ScalarOpdIdx == -1;
}
- InstructionCost getScalarizationOverhead(
- VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) const {
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {}) const {
return 0;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index e0a404d0492608..e22579730a7437 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,9 +777,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
/// extracted from vectors.
- InstructionCost getScalarizationOverhead(
- VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
- TTI::TargetCostKind CostKind, ArrayRef<Value *> VL = {}) {
+ InstructionCost getScalarizationOverhead(VectorType *InTy,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind,
+ ArrayRef<Value *> VL = {}) {
/// FIXME: a bitfield is not a reasonable abstraction for talking about
/// which elements are needed from a scalable vector
if (isa<ScalableVectorType>(InTy))