[llvm] [SystemZ] SLP reductions: cost functions of reductions and scalarization (PR #112491)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 29 03:18:44 PDT 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/112491
>From 8d81f17934b4609924484a562828ac3d36965a9c Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:51:20 +0200
Subject: [PATCH 1/8] Improvements to vector elements insertion costs.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 67 ++++++++++++++-----
.../SystemZ/SystemZTargetTransformInfo.h | 4 ++
.../Transforms/Vectorize/SLPVectorizer.cpp | 14 +++-
.../SystemZ/vec-elt-insertion.ll | 66 ++++--------------
4 files changed, 79 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 7e5728c40950ad..b98db455c2dd42 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,6 +469,33 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+/// Cost of inserting (\p Insert) and/or extracting (\p Extract) the elements
+/// of \p Ty that are selected by \p DemandedElts.
+///
+/// Insertions into vectors of 64-bit integers are costed here, on the basis
+/// that VLVGP inserts two GPRs with one instruction: each pair of adjacent
+/// elements (2k, 2k+1) costs one instruction if at least one of the two is
+/// demanded. Everything else (other insertions and all extractions) is
+/// delegated to BaseT.
+InstructionCost SystemZTTIImpl::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+  InstructionCost Cost = 0;
+
+  if (Insert && Ty->isIntOrIntVectorTy(64)) {
+    // VLVGP will insert two GPRs with one instruction.
+    InstructionCost CurrVectorCost = 0;
+    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+      if (DemandedElts[Idx])
+        ++CurrVectorCost;
+      if (Idx % 2 == 1) {
+        Cost += std::min(InstructionCost(1), CurrVectorCost);
+        CurrVectorCost = 0;
+      }
+    }
+    // With an odd number of elements the last one has no partner and never
+    // reaches the flush above; if demanded it still needs one insertion.
+    // (CurrVectorCost is always 0 here when NumElts is even.)
+    Cost += std::min(InstructionCost(1), CurrVectorCost);
+    // Insertions are fully accounted for; let BaseT add extractions only.
+    Insert = false;
+  }
+
+  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                          CostKind);
+  return Cost;
+}
+
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
@@ -610,7 +637,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConst) {
SmallVector<Type *> Tys(Args.size(), Ty);
return VF * DivMulSeqCost +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
}
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
@@ -637,7 +664,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
SmallVector<Type *> Tys(Args.size(), Ty);
InstructionCost Cost =
(VF * ScalarCost) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -655,8 +682,9 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
SmallVector<Type *> Tys(Args.size(), Ty);
- InstructionCost Cost = (VF * LIBCALL_COST) +
- getScalarizationOverhead(VTy, Args, Tys, CostKind);
+ InstructionCost Cost =
+ (VF * LIBCALL_COST) +
+ BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
@@ -976,10 +1004,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- NeedsExtracts, CostKind);
- TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
- /*Extract*/ false, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ NeedsExtracts, CostKind);
+ TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
+ /*Extract*/ false, CostKind);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -991,8 +1019,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
return VF /*ldxbr/lexbr*/ +
- getScalarizationOverhead(DstVecTy, /*Insert*/ true,
- /*Extract*/ false, CostKind);
+ BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+ /*Extract*/ false, CostKind);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -1005,8 +1033,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
- /*Extract*/ true, CostKind);
+ return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+ /*Extract*/ true, CostKind);
}
}
@@ -1115,10 +1143,17 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0,
Value *Op1) {
- // vlvgp will insert two grs into a vector register, so only count half the
- // number of instructions.
- if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
- return ((Index % 2 == 0) ? 1 : 0);
+ if (Opcode == Instruction::InsertElement) {
+ // Vector Element Load.
+ if (Op1 != nullptr && Op1->hasOneUse() && isa<LoadInst>(Op1))
+ return 0;
+
+ // vlvgp will insert two grs into a vector register, so count half the
+ // number of instructions as an estimate when we don't have the full
+ // picture (as in getScalarizationOverhead()).
+ if (Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
+ }
if (Opcode == Instruction::ExtractElement) {
int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 8cc71a6c528f82..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -81,6 +81,10 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
bool LSRWithInstrQueries() { return true; }
+ InstructionCost getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind);
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 268546fe99e138..d1fd3db0f0da8d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3033,8 +3033,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+ /// this subtree gets vectorized, we may need to insert the values from the
+ /// roots. This method calculates the cost of inserting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
@@ -13018,7 +13018,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
} else {
- Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+ // Add insertion costs for all elements, but not for loads that can be
+ // loaded directly into a vector element for free.
+ APInt FreeEltLoads = APInt::getZero(VL.size());
+ if (TTI->supportsEfficientVectorElementLoadStore())
+ for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
+ FreeEltLoads.setBit(I);
+ APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
+ Cost = TTI->getScalarizationOverhead(VecTy, DemandedElts,
/*Insert*/ true,
/*Extract*/ false, CostKind);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index eb8dd72e0304d9..7e64b42c52aa94 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,8 +1,4 @@
-; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
-; RUN: -pass-remarks-output=%t | FileCheck %s
-; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-;
-; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
;
; Test functions that (at least currently) only get vectorized if the
; insertion cost for an element load is counted as free.
@@ -11,19 +7,8 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-; CHECK: fmul double
-; CHECK: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.sqrt.f64(
-;
-; REMARK-LABEL: Function: fun0
-; REMARK: Args:
-; REMARK-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -39,36 +24,18 @@ define void @fun0(ptr nocapture %0, double %1) {
ret void
}
+
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) {
+define void @fun1(double %0) local_unnamed_addr {
; CHECK-LABEL: define void @fun1(
-; CHECK: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: phi double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fmul double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: fsub double
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK-NEXT: call double @llvm.fmuladd.f64(
-; CHECK: fcmp olt double
-; CHECK-NEXT: fcmp olt double
-; CHECK-NEXT: or i1
-;
-; REMARK-LABEL: Function: fun1
-; REMARK: Args:
-; REMARK: - String: 'List vectorization was possible but not beneficial with cost '
-; REMARK-NEXT: - Cost: '0'
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: %14 = fcmp olt <2 x double> %13, %2
br label %2
@@ -104,14 +71,7 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTIImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK: insertelement
-; CHECK: store <2 x i64>
-;
-; REMARK-LABEL: Function: fun2
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost '
-; REMARK-NEXT: - Cost: '-1'
-
+; CHECK-NOT: store <2 x i64>
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
>From 5f591b8f7e1073fc806125c81747114da9cf0cb3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 9 Oct 2024 11:52:37 +0200
Subject: [PATCH 2/8] FP reduction cost functions (for SLP)
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 +++
.../SystemZ/SystemZTargetTransformInfo.h | 7 +
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ++++++
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 ++++++++
.../SystemZ/reductions-fmin-fmax.ll | 411 ++++++++++++++++++
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 ++++++++
6 files changed, 976 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b98db455c2dd42..9ab5a77280e6f9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1388,6 +1388,57 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
+// EXPERIMENTAL: command-line option "redlim" (initial value 0).
+// NOTE(review): in the code visible in this patch it is only referenced from
+// commented-out code in getArithmeticReductionCost() -- confirm it is still
+// needed before this lands.
+static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
+
+/// Cost of an FP reduction that spans \p NumVec vector registers with
+/// elements of \p ScalarBits bits each: one full vector operation for every
+/// register beyond the first, plus one scalar operation per element of the
+/// last remaining vector.
+InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
+  const unsigned EltsPerVecReg = SystemZ::VectorBits / ScalarBits;
+  const InstructionCost FullVectorOps = NumVec - 1;
+  const InstructionCost LastVectorScalarOps = EltsPerVecReg;
+  return FullVectorOps + LastVectorScalarOps;
+}
+
+/// Cost of an arithmetic reduction of \p Ty with operation \p Opcode.
+/// FAdd and FMul reductions that do not require ordering are costed with
+/// getFPReductionCost() when ST->hasVector() holds; every other case uses the
+/// BaseT estimate.
+InstructionCost SystemZTTIImpl::getArithmeticReductionCost(
+    unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
+    TTI::TargetCostKind CostKind) {
+  const bool IsFAddOrFMul =
+      Opcode == Instruction::FAdd || Opcode == Instruction::FMul;
+  if (TTI::requiresOrderedReduction(FMF) || !ST->hasVector() || !IsFAddOrFMul)
+    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+
+  unsigned NumVectors = getNumVectorRegs(Ty);
+  unsigned EltBits = Ty->getScalarSizeInBits();
+
+  // EXPERIMENTAL (disabled): better to not vectorize small vectors?
+  //   unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+  //   if (NumElts <= REDLIM)
+  //     return NumVectors * 8; // => MachineCombiner
+  // EXPERIMENTAL (disabled): return a low cost to enable heavily.
+  //   return NumVectors / 2;
+
+  return getFPReductionCost(NumVectors, EltBits);
+}
+
+/// Cost of a min/max reduction (\p IID) of \p Ty. FP reductions are costed
+/// with getFPReductionCost() when ST->hasVectorEnhancements1() holds; every
+/// other case uses the BaseT estimate.
+InstructionCost SystemZTTIImpl::getMinMaxReductionCost(
+    Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
+    TTI::TargetCostKind CostKind) {
+  if (!Ty->isFPOrFPVectorTy() || !ST->hasVectorEnhancements1())
+    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+
+  unsigned NumVectors = getNumVectorRegs(Ty);
+  unsigned EltBits = Ty->getScalarSizeInBits();
+
+  // EXPERIMENTAL (disabled): return a low cost to enable heavily.
+  //   return NumVectors / 2;
+
+  return getFPReductionCost(NumVectors, EltBits);
+}
+
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 80294ada23c3a9..b65e75ab98814c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,6 +129,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ std::optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
new file mode 100644
index 00000000000000..055c25298d847e
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @fadd_reductions() {
+; Z15-LABEL: 'fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fadd_reductions() {
+; Z15-LABEL: 'fast_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmul_reductions() {
+; Z15-LABEL: 'fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+ ret void
+}
+
+define void @fast_fmul_reductions() {
+; Z15-LABEL: 'fast_fmul_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
+ %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
+ %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
+ %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
+ %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
+
+ ret void
+}
+
+define void @fmin_reductions() {
+; Z15-LABEL: 'fmin_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+define void @fmax_reductions() {
+; Z15-LABEL: 'fmax_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+ %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+ %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+ %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+ %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
+ ret void
+}
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
+declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
+
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
+declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
+declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
new file mode 100644
index 00000000000000..fa0587f1da931b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+;
+; Test vectorization and reassociation of fadd operations. If the loads can
+; be vectorized, cases of fewer operands are also profitable to vectorize.
+
+define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_4_addends_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ ret double %add5
+}
+
+define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+ ret double %add13
+}
+
+define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
+ ret float %add29
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
new file mode 100644
index 00000000000000..5a466178ba786b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmin/fmax operations. Vectorization
+; is more profitable if the loads are also vectorizable.
+
+define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_4_nums_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 1
+ %g2 = getelementptr inbounds double, ptr %x, i64 2
+ %g3 = getelementptr inbounds double, ptr %x, i64 3
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ ret double %m3
+}
+
+define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
+; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
+; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
+; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
+; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
+; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
+; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
+; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
+; CHECK-NEXT: ret double [[TMP17]]
+;
+ %g1 = getelementptr inbounds double, ptr %x, i64 2
+ %g2 = getelementptr inbounds double, ptr %x, i64 4
+ %g3 = getelementptr inbounds double, ptr %x, i64 6
+ %g4 = getelementptr inbounds double, ptr %x, i64 8
+ %g5 = getelementptr inbounds double, ptr %x, i64 10
+ %g6 = getelementptr inbounds double, ptr %x, i64 12
+ %g7 = getelementptr inbounds double, ptr %x, i64 14
+ %g8 = getelementptr inbounds double, ptr %x, i64 16
+ %g9 = getelementptr inbounds double, ptr %x, i64 18
+ %g10 = getelementptr inbounds double, ptr %x, i64 20
+ %g11 = getelementptr inbounds double, ptr %x, i64 22
+ %g12 = getelementptr inbounds double, ptr %x, i64 24
+ %g13 = getelementptr inbounds double, ptr %x, i64 26
+ %g14 = getelementptr inbounds double, ptr %x, i64 28
+ %g15 = getelementptr inbounds double, ptr %x, i64 30
+ %t0 = load double, ptr %x, align 4
+ %t1 = load double, ptr %g1, align 4
+ %t2 = load double, ptr %g2, align 4
+ %t3 = load double, ptr %g3, align 4
+ %t4 = load double, ptr %g4, align 4
+ %t5 = load double, ptr %g5, align 4
+ %t6 = load double, ptr %g6, align 4
+ %t7 = load double, ptr %g7, align 4
+ %t8 = load double, ptr %g8, align 4
+ %t9 = load double, ptr %g9, align 4
+ %t10 = load double, ptr %g10, align 4
+ %t11 = load double, ptr %g11, align 4
+ %t12 = load double, ptr %g12, align 4
+ %t13 = load double, ptr %g13, align 4
+ %t14 = load double, ptr %g14, align 4
+ %t15 = load double, ptr %g15, align 4
+ %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
+ %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
+ %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
+ %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
+ %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
+ %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
+ %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
+ %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
+ %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
+ %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
+ %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
+ %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
+ %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
+ %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
+ %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
+ ret double %m15
+}
+
+define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
+; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
+; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
+; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
+; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
+; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
+; CHECK-NEXT: ret float [[TMP13]]
+;
+ %g1 = getelementptr inbounds float, ptr %x, i64 2
+ %g2 = getelementptr inbounds float, ptr %x, i64 4
+ %g3 = getelementptr inbounds float, ptr %x, i64 6
+ %g4 = getelementptr inbounds float, ptr %x, i64 8
+ %g5 = getelementptr inbounds float, ptr %x, i64 10
+ %g6 = getelementptr inbounds float, ptr %x, i64 12
+ %g7 = getelementptr inbounds float, ptr %x, i64 14
+ %g8 = getelementptr inbounds float, ptr %x, i64 16
+ %g9 = getelementptr inbounds float, ptr %x, i64 18
+ %g10 = getelementptr inbounds float, ptr %x, i64 20
+ %g11 = getelementptr inbounds float, ptr %x, i64 22
+ %t0 = load float, ptr %x, align 4
+ %t1 = load float, ptr %g1, align 4
+ %t2 = load float, ptr %g2, align 4
+ %t3 = load float, ptr %g3, align 4
+ %t4 = load float, ptr %g4, align 4
+ %t5 = load float, ptr %g5, align 4
+ %t6 = load float, ptr %g6, align 4
+ %t7 = load float, ptr %g7, align 4
+ %t8 = load float, ptr %g8, align 4
+ %t9 = load float, ptr %g9, align 4
+ %t10 = load float, ptr %g10, align 4
+ %t11 = load float, ptr %g11, align 4
+ %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
+ %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
+ %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
+ %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
+ %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
+ %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
+ %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
+ %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
+ %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
+ %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
+ %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
+ ret float %m11
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
new file mode 100644
index 00000000000000..e08b38c69a840d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN: | FileCheck %s
+
+; Test vectorization and reassociation of fmul operations. If the loads can
+; be vectorized, cases of fewer operands are also profitable to vectorize.
+
+define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_4_factors_seq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ ret double %mul5
+}
+
+define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
+; CHECK-NEXT: ret double [[TMP16]]
+;
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
+ %4 = load double, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
+ %5 = load double, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
+ %6 = load double, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
+ %7 = load double, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+ ret double %mul13
+}
+
+define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
+; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
+; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
+; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
+; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
+; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
+; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
+; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
+; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
+; CHECK-NEXT: ret float [[TMP32]]
+;
+entry:
+ %0 = load float, ptr %x, align 4
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
+ %2 = load float, ptr %arrayidx2, align 4
+ %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
+ %3 = load float, ptr %arrayidx4, align 4
+ %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
+ %4 = load float, ptr %arrayidx6, align 4
+ %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
+ %5 = load float, ptr %arrayidx8, align 4
+ %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
+ %6 = load float, ptr %arrayidx10, align 4
+ %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
+ %7 = load float, ptr %arrayidx12, align 4
+ %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+ %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
+ %8 = load float, ptr %arrayidx14, align 4
+ %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
+ %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
+ %9 = load float, ptr %arrayidx16, align 4
+ %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
+ %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
+ %10 = load float, ptr %arrayidx18, align 4
+ %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
+ %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
+ %11 = load float, ptr %arrayidx20, align 4
+ %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
+ %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
+ %12 = load float, ptr %arrayidx22, align 4
+ %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
+ %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
+ %13 = load float, ptr %arrayidx24, align 4
+ %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
+ %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
+ %14 = load float, ptr %arrayidx26, align 4
+ %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
+ %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
+ %15 = load float, ptr %arrayidx28, align 4
+ %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
+ ret float %mul29
+}
>From 838a980eb4333918342eb04f418acc09849d4fc3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:16:24 +0200
Subject: [PATCH 3/8] Revert "FP reduction cost functions (for SLP)"
Hold off on this for now and first evaluate the scalarization costs separately.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 51 ---
.../SystemZ/SystemZTargetTransformInfo.h | 7 -
.../CostModel/SystemZ/vector-reductions-fp.ll | 131 ------
.../SLPVectorizer/SystemZ/reductions-fadd.ll | 188 --------
.../SystemZ/reductions-fmin-fmax.ll | 411 ------------------
.../SLPVectorizer/SystemZ/reductions-fmul.ll | 188 --------
6 files changed, 976 deletions(-)
delete mode 100644 llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
delete mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 9ab5a77280e6f9..b98db455c2dd42 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1388,57 +1388,6 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
return NumVectorMemOps + NumPermutes;
}
-// EXPERIMENTAL
-static cl::opt<unsigned> REDLIM("redlim", cl::init(0));
-
-InstructionCost getFPReductionCost(unsigned NumVec, unsigned ScalarBits) {
- unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
- InstructionCost Cost = 0;
- Cost += NumVec - 1; // Full vector operations.
- Cost += NumEltsPerVecReg; // Last vector scalar operations.
- return Cost;
-}
-
-InstructionCost
-SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind) {
- if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector() &&
- (Opcode == Instruction::FAdd || Opcode == Instruction::FMul)) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: better to not vectorize small vectors?:
- // unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- // if (NumElts <= REDLIM)
- // return NumVectors * 8; // => MachineCombiner
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
-}
-
-InstructionCost
-SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind) {
- if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
- unsigned NumVectors = getNumVectorRegs(Ty);
- unsigned ScalarBits = Ty->getScalarSizeInBits();
-
- // // EXPERIMENTAL: Return a low cost to enable heavily.
- // return NumVectors / 2;
-
- return getFPReductionCost(NumVectors, ScalarBits);
- }
-
- return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
-}
-
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
const SmallVectorImpl<Type *> &ParamTys) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b65e75ab98814c..80294ada23c3a9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -129,13 +129,6 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
- InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
- std::optional<FastMathFlags> FMF,
- TTI::TargetCostKind CostKind);
- InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
- FastMathFlags FMF,
- TTI::TargetCostKind CostKind);
-
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
deleted file mode 100644
index 055c25298d847e..00000000000000
--- a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
-; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
-
-define void @fadd_reductions() {
-; Z15-LABEL: 'fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fadd_reductions() {
-; Z15-LABEL: 'fast_fadd_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
- %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
- %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
- %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmul_reductions() {
-; Z15-LABEL: 'fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
- ret void
-}
-
-define void @fast_fmul_reductions() {
-; Z15-LABEL: 'fast_fmul_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
- %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
- %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
- %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
- %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
-
- ret void
-}
-
-define void @fmin_reductions() {
-; Z15-LABEL: 'fmin_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
- ret void
-}
-
-define void @fmax_reductions() {
-; Z15-LABEL: 'fmax_reductions'
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
-; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
- %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
- %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
- %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
- %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
- %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
- ret void
-}
-
-declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
-declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
-declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
-declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
-declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)
-
-declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)
-
-declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
-declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
-declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
-declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
-declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
deleted file mode 100644
index fa0587f1da931b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-;
-; Test vectorization and reassociation of fadd operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_4_addends_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- ret double %add5
-}
-
-define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fadd_double_8_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %add = fadd reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %add3 = fadd reassoc nsz arcp contract afn double %add, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
- ret double %add13
-}
-
-define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fadd_float_16_addends_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %add = fadd reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %add3 = fadd reassoc nsz arcp contract afn float %add, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %add15 = fadd reassoc nsz arcp contract afn float %add13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %add17 = fadd reassoc nsz arcp contract afn float %add15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %add19 = fadd reassoc nsz arcp contract afn float %add17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %add21 = fadd reassoc nsz arcp contract afn float %add19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %add23 = fadd reassoc nsz arcp contract afn float %add21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %add25 = fadd reassoc nsz arcp contract afn float %add23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %add27 = fadd reassoc nsz arcp contract afn float %add25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %add29 = fadd reassoc nsz arcp contract afn float %add27, %15
- ret float %add29
-}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
deleted file mode 100644
index 5a466178ba786b..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll
+++ /dev/null
@@ -1,411 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmin/fmax operations. Vectorization
-; is more profitable if the loads are also vectorizable.
-
-define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmin_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmin_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.minnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.minnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.minnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.minnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_4_nums_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]])
-; CHECK-NEXT: ret double [[TMP2]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 1
- %g2 = getelementptr inbounds double, ptr %x, i64 2
- %g3 = getelementptr inbounds double, ptr %x, i64 3
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- ret double %m3
-}
-
-define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmax_double_16_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22
-; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24
-; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26
-; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28
-; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30
-; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4
-; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4
-; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4
-; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4
-; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15
-; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]])
-; CHECK-NEXT: ret double [[TMP17]]
-;
- %g1 = getelementptr inbounds double, ptr %x, i64 2
- %g2 = getelementptr inbounds double, ptr %x, i64 4
- %g3 = getelementptr inbounds double, ptr %x, i64 6
- %g4 = getelementptr inbounds double, ptr %x, i64 8
- %g5 = getelementptr inbounds double, ptr %x, i64 10
- %g6 = getelementptr inbounds double, ptr %x, i64 12
- %g7 = getelementptr inbounds double, ptr %x, i64 14
- %g8 = getelementptr inbounds double, ptr %x, i64 16
- %g9 = getelementptr inbounds double, ptr %x, i64 18
- %g10 = getelementptr inbounds double, ptr %x, i64 20
- %g11 = getelementptr inbounds double, ptr %x, i64 22
- %g12 = getelementptr inbounds double, ptr %x, i64 24
- %g13 = getelementptr inbounds double, ptr %x, i64 26
- %g14 = getelementptr inbounds double, ptr %x, i64 28
- %g15 = getelementptr inbounds double, ptr %x, i64 30
- %t0 = load double, ptr %x, align 4
- %t1 = load double, ptr %g1, align 4
- %t2 = load double, ptr %g2, align 4
- %t3 = load double, ptr %g3, align 4
- %t4 = load double, ptr %g4, align 4
- %t5 = load double, ptr %g5, align 4
- %t6 = load double, ptr %g6, align 4
- %t7 = load double, ptr %g7, align 4
- %t8 = load double, ptr %g8, align 4
- %t9 = load double, ptr %g9, align 4
- %t10 = load double, ptr %g10, align 4
- %t11 = load double, ptr %g11, align 4
- %t12 = load double, ptr %g12, align 4
- %t13 = load double, ptr %g13, align 4
- %t14 = load double, ptr %g14, align 4
- %t15 = load double, ptr %g15, align 4
- %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0)
- %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1)
- %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2)
- %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3)
- %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4)
- %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5)
- %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6)
- %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7)
- %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8)
- %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9)
- %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10)
- %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11)
- %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12)
- %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13)
- %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14)
- ret double %m15
-}
-
-define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmax_float_12_nums_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4
-; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4
-; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4
-; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4
-; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[T1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[T0]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[T2]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[T3]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[T4]], i32 4
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[T5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[T6]], i32 6
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[T7]], i32 7
-; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP9]], float [[T8]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.maxnum.f32(float [[T9]], float [[T10]])
-; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.maxnum.f32(float [[TMP12]], float [[T11]])
-; CHECK-NEXT: ret float [[TMP13]]
-;
- %g1 = getelementptr inbounds float, ptr %x, i64 2
- %g2 = getelementptr inbounds float, ptr %x, i64 4
- %g3 = getelementptr inbounds float, ptr %x, i64 6
- %g4 = getelementptr inbounds float, ptr %x, i64 8
- %g5 = getelementptr inbounds float, ptr %x, i64 10
- %g6 = getelementptr inbounds float, ptr %x, i64 12
- %g7 = getelementptr inbounds float, ptr %x, i64 14
- %g8 = getelementptr inbounds float, ptr %x, i64 16
- %g9 = getelementptr inbounds float, ptr %x, i64 18
- %g10 = getelementptr inbounds float, ptr %x, i64 20
- %g11 = getelementptr inbounds float, ptr %x, i64 22
- %t0 = load float, ptr %x, align 4
- %t1 = load float, ptr %g1, align 4
- %t2 = load float, ptr %g2, align 4
- %t3 = load float, ptr %g3, align 4
- %t4 = load float, ptr %g4, align 4
- %t5 = load float, ptr %g5, align 4
- %t6 = load float, ptr %g6, align 4
- %t7 = load float, ptr %g7, align 4
- %t8 = load float, ptr %g8, align 4
- %t9 = load float, ptr %g9, align 4
- %t10 = load float, ptr %g10, align 4
- %t11 = load float, ptr %g11, align 4
- %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0)
- %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1)
- %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2)
- %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3)
- %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4)
- %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5)
- %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6)
- %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7)
- %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8)
- %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9)
- %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10)
- ret float %m11
-}
-
-declare float @llvm.minnum.f32(float, float)
-declare double @llvm.minnum.f64(double, double)
-declare float @llvm.maxnum.f32(float, float)
-declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
deleted file mode 100644
index e08b38c69a840d..00000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN: | FileCheck %s
-
-; Test vectorization and reassociation of fmul operations. If the loads can
-; be vectorized, cases of fewer operands are also profitable to vectorize.
-
-define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_4_factors_seq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]])
-; CHECK-NEXT: ret double [[TMP1]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- ret double %mul5
-}
-
-define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define double @fmul_double_8_factors_nonseq(
-; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]])
-; CHECK-NEXT: ret double [[TMP16]]
-;
-entry:
- %0 = load double, ptr %x, align 8
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2
- %1 = load double, ptr %arrayidx1, align 8
- %mul = fmul reassoc nsz arcp contract afn double %1, %0
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4
- %2 = load double, ptr %arrayidx2, align 8
- %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6
- %3 = load double, ptr %arrayidx4, align 8
- %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8
- %4 = load double, ptr %arrayidx6, align 8
- %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10
- %5 = load double, ptr %arrayidx8, align 8
- %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12
- %6 = load double, ptr %arrayidx10, align 8
- %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14
- %7 = load double, ptr %arrayidx12, align 8
- %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
- ret double %mul13
-}
-
-define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) {
-; CHECK-LABEL: define float @fmul_float_16_factors_nonseq(
-; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
-; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4
-; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15
-; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]])
-; CHECK-NEXT: ret float [[TMP32]]
-;
-entry:
- %0 = load float, ptr %x, align 4
- %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2
- %1 = load float, ptr %arrayidx1, align 4
- %mul = fmul reassoc nsz arcp contract afn float %1, %0
- %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4
- %2 = load float, ptr %arrayidx2, align 4
- %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
- %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6
- %3 = load float, ptr %arrayidx4, align 4
- %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
- %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8
- %4 = load float, ptr %arrayidx6, align 4
- %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
- %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10
- %5 = load float, ptr %arrayidx8, align 4
- %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
- %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12
- %6 = load float, ptr %arrayidx10, align 4
- %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
- %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14
- %7 = load float, ptr %arrayidx12, align 4
- %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
- %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16
- %8 = load float, ptr %arrayidx14, align 4
- %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8
- %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18
- %9 = load float, ptr %arrayidx16, align 4
- %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9
- %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20
- %10 = load float, ptr %arrayidx18, align 4
- %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10
- %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22
- %11 = load float, ptr %arrayidx20, align 4
- %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11
- %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24
- %12 = load float, ptr %arrayidx22, align 4
- %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12
- %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26
- %13 = load float, ptr %arrayidx24, align 4
- %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13
- %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28
- %14 = load float, ptr %arrayidx26, align 4
- %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14
- %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30
- %15 = load float, ptr %arrayidx28, align 4
- %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15
- ret float %mul29
-}
>From 20ff5d31489bf3e37ee97d6c413c5c18f2bda8f7 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 21 Oct 2024 17:19:29 +0200
Subject: [PATCH 4/8] Cosmetic update per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d1fd3db0f0da8d..81849cc0ca8799 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13022,7 +13022,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
// loaded directly into a vector element for free.
APInt FreeEltLoads = APInt::getZero(VL.size());
if (TTI->supportsEfficientVectorElementLoadStore())
- for (unsigned I = 0, E = VL.size(); I < E; ++I)
+ for (unsigned I : seq<unsigned>(VL.size()))
if (VL[I]->hasOneUse() && isa<LoadInst>(VL[I]))
FreeEltLoads.setBit(I);
APInt DemandedElts = ~ShuffledElements & ~FreeEltLoads;
>From cf446ef61f509a08cf68bd7e89c30d58fcb074da Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 22 Oct 2024 14:54:10 +0200
Subject: [PATCH 5/8] Minor updates per review.
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +++----
.../Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll | 2 +-
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 81849cc0ca8799..e4baa471ec123a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3032,10 +3032,9 @@ class BoUpSLP {
SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
unsigned NumParts, bool ForOrder = false);
- /// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to insert the values from the
- /// roots. This method calculates the cost of inserting the values.
- /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
+ /// \returns the cost of gathering (inserting) the values in \p VL into a
+ /// vector. \param ForPoisonSrc true if initial vector is poison, false
+ /// otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 7e64b42c52aa94..0c51cb2996dd4d 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -27,7 +27,7 @@ define void @fun0(ptr nocapture %0, double %1) {
; This function needs the element-load to be recognized in SystemZ
; getVectorInstrCost().
-define void @fun1(double %0) local_unnamed_addr {
+define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
; CHECK: fsub <2 x double>
; CHECK: fsub <2 x double>
>From 0aa4a4ff4e1285c81ab1bc17a91f31b65b038aea Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 15:38:09 +0200
Subject: [PATCH 6/8] Some more minor updates.
---
.../SystemZ/SystemZTargetTransformInfo.cpp | 12 +++++-------
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++--
.../SLPVectorizer/SystemZ/vec-elt-insertion.ll | 17 ++++++++++++++++-
3 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b98db455c2dd42..0f40ed2be0003a 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -469,11 +469,9 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
-InstructionCost SystemZTTIImpl::
-getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract,
- TTI::TargetCostKind CostKind) {
+InstructionCost SystemZTTIImpl::getScalarizationOverhead(
+ VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+ TTI::TargetCostKind CostKind) {
unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
InstructionCost Cost = 0;
@@ -491,8 +489,8 @@ getScalarizationOverhead(VectorType *Ty,
Insert = false;
}
- Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
- Extract, CostKind);
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+ CostKind);
return Cost;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e4baa471ec123a..d938c3caa535f1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3033,8 +3033,8 @@ class BoUpSLP {
unsigned NumParts, bool ForOrder = false);
/// \returns the cost of gathering (inserting) the values in \p VL into a
- /// vector. \param ForPoisonSrc true if initial vector is poison, false
- /// otherwise.
+ /// vector.
+ /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Type *ScalarTy) const;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 0c51cb2996dd4d..722fdc84463e55 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-unknown-linux -mcpu=z16 < %s | FileCheck %s
+; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
+; RUN: -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
;
; Test functions that (at least currently) only gets vectorized if the
; insertion cost for an element load is counted as free.
@@ -9,6 +11,11 @@ define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
; CHECK: fmul <2 x double>
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+;
+; REMARK-LABEL: Function: fun0
+; REMARK: Args:
+; REMARK-NEXT: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
%3 = fmul double %1, 2.000000e+00
%4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
@@ -36,6 +43,11 @@ define void @fun1(double %0) {
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
; CHECK: %14 = fcmp olt <2 x double> %13, %2
+;
+; REMARK-LABEL: Function: fun1
+; REMARK: Args:
+; REMARK: - String: 'SLP vectorized with cost '
+; REMARK-NEXT: - Cost: '-1'
br label %2
@@ -72,6 +84,9 @@ declare double @llvm.fmuladd.f64(double, double, double)
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
; CHECK-NOT: store <2 x i64>
+;
+; REMARK-NOT: Function: fun2
+
%3 = load i64, ptr %0, align 8
%4 = icmp eq i64 %3, 0
br i1 %4, label %5, label %6
>From 100278ee2792fb5a88885618df3e37d27f05f2a0 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 18:04:10 +0200
Subject: [PATCH 7/8] Test updated on top of main.
---
.../SystemZ/vec-elt-insertion.ll | 29 ++++++++++++-------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 722fdc84463e55..906ad28c37db98 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -9,8 +9,10 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
;
; REMARK-LABEL: Function: fun0
; REMARK: Args:
@@ -36,13 +38,19 @@ define void @fun0(ptr nocapture %0, double %1) {
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fsub <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK: %14 = fcmp olt <2 x double> %13, %2
+; CHECK: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: phi <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK-NEXT: fsub <2 x double>
+; CHECK: fmul <2 x double>
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
+; CHECK-NEXT: fcmp olt <2 x double>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: extractelement <2 x i1>
+; CHECK-NEXT: or i1
;
; REMARK-LABEL: Function: fun1
; REMARK: Args:
@@ -83,7 +91,8 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK-NOT: store <2 x i64>
+; CHECK: store i64
+; CHECK: store i64
;
; REMARK-NOT: Function: fun2
>From a9eddd1e5c9562c90ff4e7eafe9ec39a30f7d191 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 25 Oct 2024 18:45:13 +0200
Subject: [PATCH 8/8] Auto-generate test instead
---
.../SystemZ/vec-elt-insertion.ll | 65 +++++++++++++------
1 file changed, 46 insertions(+), 19 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
index 906ad28c37db98..5d0eaf77a22af3 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
; RUN: -pass-remarks-output=%t | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=REMARK %s
@@ -9,10 +10,19 @@
; getGatherCost().
define void @fun0(ptr nocapture %0, double %1) {
; CHECK-LABEL: define void @fun0(
-; CHECK: fmul <2 x double>
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.sqrt.v2f64(
+; CHECK-SAME: ptr nocapture [[TMP0:%.*]], double [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[TMP6]], <2 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = fadd double [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store double [[TMP12]], ptr [[TMP0]], align 8
+; CHECK-NEXT: ret void
;
; REMARK-LABEL: Function: fun0
; REMARK: Args:
@@ -38,19 +48,26 @@ define void @fun0(ptr nocapture %0, double %1) {
; getVectorInstrCost().
define void @fun1(double %0) {
; CHECK-LABEL: define void @fun1(
-; CHECK: phi <2 x double>
-; CHECK-NEXT: phi <2 x double>
-; CHECK-NEXT: phi <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK-NEXT: fsub <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: call <2 x double> @llvm.fmuladd.v2f64(
-; CHECK-NEXT: fcmp olt <2 x double>
-; CHECK-NEXT: extractelement <2 x i1>
-; CHECK-NEXT: extractelement <2 x i1>
-; CHECK-NEXT: or i1
+; CHECK-SAME: double [[TMP0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
+; CHECK-NEXT: br label %[[BB3:.*]]
+; CHECK: [[BB3]]:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ <double poison, double undef>, [[TMP1:%.*]] ], [ poison, %[[BB3]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ zeroinitializer, [[TMP1]] ], [ poison, %[[BB3]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ zeroinitializer, [[TMP1]] ], [ [[TMP18:%.*]], %[[BB3]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x double> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr null, align 8
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x double> [[TMP11]])
+; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP9]], <2 x double> [[TMP9]], <2 x double> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <2 x double> [[TMP13]], [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP14]], i32 1
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18]] = insertelement <2 x double> poison, double [[TMP10]], i32 1
+; CHECK-NEXT: br label %[[BB3]]
;
; REMARK-LABEL: Function: fun1
; REMARK: Args:
@@ -91,8 +108,18 @@ declare double @llvm.fmuladd.f64(double, double, double)
; which is recognized in SystemZTTImpl::getScalarizationOverhead().
define void @fun2(ptr %0, ptr %Dst) {
; CHECK-LABEL: define void @fun2(
-; CHECK: store i64
-; CHECK: store i64
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[BB4:.*]], label %[[BB5:.*]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: ret void
+; CHECK: [[BB5]]:
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 24
+; CHECK-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 16
+; CHECK-NEXT: store i64 0, ptr [[TMP7]], align 8
+; CHECK-NEXT: br label %[[BB4]]
;
; REMARK-NOT: Function: fun2
More information about the llvm-commits mailing list