[llvm] 2955cc1 - [ARM] Improve costs for FMin/Max reductions
David Green via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 4 04:49:18 PDT 2023
Author: David Green
Date: 2023-09-04T12:49:13+01:00
New Revision: 2955cc15ff2242b53cce5e0318ed6867f1b06714
URL: https://github.com/llvm/llvm-project/commit/2955cc15ff2242b53cce5e0318ed6867f1b06714
DIFF: https://github.com/llvm/llvm-project/commit/2955cc15ff2242b53cce5e0318ed6867f1b06714.diff
LOG: [ARM] Improve costs for FMin/Max reductions
Similar to the other reductions, this changes the cost of fmin/fmax reductions
under MVE/NEON to perform vector operations until the types need to be
scalarized. The fp16 vectors can perform a VREV+FMIN/FMAX to skip a step of the
reduction, and otherwise need lanewise extracts from the top lanes.
Added:
Modified:
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 39e0449f668459..be490d48118b6e 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1785,6 +1785,50 @@ ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, ValTy, CostKind);
}
+InstructionCost
+ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ EVT ValVT = TLI->getValueType(DL, Ty);
+
+ // In general floating point reductions are a series of elementwise
+ // operations, with free extracts on each step. These are either in-order or
+ // treewise depending on whether that is allowed by the fast math flags.
+ if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
+ ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
+ (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
+ (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
+ unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ unsigned EltSize = ValVT.getScalarSizeInBits();
+ unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
+ InstructionCost VecCost;
+ while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
+ Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
+ IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
+ VecCost += getIntrinsicInstrCost(ICA, CostKind);
+ NumElts /= 2;
+ }
+
+ // For fp16 we need to extract the upper lane elements. MVE can add a
+ // VREV+FMIN/MAX to perform another vector step instead.
+ InstructionCost ExtractCost = 0;
+ if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
+ NumElts == 8) {
+ VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
+ NumElts /= 2;
+ } else if (ValVT.getVectorElementType() == MVT::f16)
+ ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
+
+ IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
+ {Ty->getElementType(), Ty->getElementType()},
+ FMF);
+ return VecCost + ExtractCost +
+ (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
+ }
+
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
+}
+
InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 588704d5b7e565..bb4b321b530091 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -290,6 +290,10 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
VectorType *ValTy,
TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
+
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
index 5bc936e1fe38e6..1565a138b93d3c 100644
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
@@ -294,19 +294,19 @@ define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
define void @reduce_fmax(<16 x float> %va) {
; THRU-LABEL: 'reduce_fmax'
-; THRU-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; THRU-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'reduce_fmax'
-; LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll
index 14b27062eebb66..48edae8c7d1370 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-fminmax.ll
@@ -11,22 +11,22 @@ define void @fmin_strict() {
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-MVEFP-LABEL: 'fmin_strict'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
@@ -65,22 +65,22 @@ define void @fmin_unordered() {
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-MVEFP-LABEL: 'fmin_unordered'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmin_v2f16 = call reassoc half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v4f16 = call reassoc half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmin_v8f16 = call reassoc half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmin_v16f16 = call reassoc half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmin_v2f32 = call reassoc float @llvm.vector.reduce.fmin.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmin_v4f32 = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmin_v8f32 = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmin_v2f64 = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmin_v4f64 = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmin_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
@@ -118,22 +118,22 @@ define void @fmax_strict() {
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-MVEFP-LABEL: 'fmax_strict'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
@@ -172,22 +172,22 @@ define void @fmax_unordered() {
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
-; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
+; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
; CHECK-V8-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-MVEFP-LABEL: 'fmax_unordered'
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
-; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmax_v2f16 = call reassoc half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v4f16 = call reassoc half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fmax_v8f16 = call reassoc half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fmax_v16f16 = call reassoc half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmax_v2f32 = call reassoc float @llvm.vector.reduce.fmax.v2f32(<2 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmax_v4f32 = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
+; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmax_v8f32 = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fmax_v2f64 = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %fmax_v4f64 = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fmax_v4f128 = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
More information about the llvm-commits
mailing list