[PATCH] D131028: [AArch64] Fix cost model for FADD vector reduction
Vir Narula via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 4 10:38:25 PDT 2022
virnarula updated this revision to Diff 450056.
virnarula marked an inline comment as done.
virnarula added a comment.
Fix order of commits for tests
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D131028/new/
https://reviews.llvm.org/D131028
Files:
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
Index: llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
===================================================================
--- llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -26,24 +26,24 @@
define void @fast_fp_reductions() {
; CHECK-LABEL: 'fast_fp_reductions'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2735,6 +2735,32 @@
switch (ISD) {
default:
break;
+ case ISD::FADD: {
+ Type *EltType = ValTy->getElementType();
+ if (!EltType->isDoubleTy() && !EltType->isFloatTy() && !EltType->isHalfTy())
+ break;
+
+ unsigned NumVecElts = cast<FixedVectorType>(ValTy)->getNumElements();
+ unsigned RoundedNumVecElts = pow(2, Log2_32_Ceil(NumVecElts));
+ unsigned BitsPerElement = EltType->getScalarSizeInBits();
+ auto VectorRegBitWidth = getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ unsigned ElementsPerRegister = VectorRegBitWidth / BitsPerElement;
+
+ // While there are more elements than fit into a single register,
+ // we will use element-wise vector adds to reduce the elements.
+ InstructionCost VectorFADD;
+ unsigned ElementsLeft = RoundedNumVecElts;
+ while (ElementsLeft > ElementsPerRegister) {
+ VectorFADD += ElementsLeft / (ElementsPerRegister * 2);
+ ElementsLeft /= 2;
+ }
+
+ // Once the remaining elements fit into a single vector register,
+ // they will be reduced using pairwise adds (faddp).
+ InstructionCost FADDPCosts(Log2_32_Ceil(ElementsLeft));
+
+ return FADDPCosts + VectorFADD;
+ }
case ISD::ADD:
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
return (LT.first - 1) + Entry->Cost;
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D131028.450056.patch
Type: text/x-patch
Size: 8555 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220804/ec0e3ca6/attachment.bin>
More information about the llvm-commits
mailing list