[llvm] f2c25c7 - [CostModel] remove cost-kind predicate for some vector reduction costs

Sanjay Patel via llvm-commits <llvm-commits at lists.llvm.org>
Sun Oct 25 12:21:45 PDT 2020


Author: Sanjay Patel
Date: 2020-10-25T15:17:52-04:00
New Revision: f2c25c70791de95d2466e09b5b58fc37f6ccd7a4

URL: https://github.com/llvm/llvm-project/commit/f2c25c70791de95d2466e09b5b58fc37f6ccd7a4
DIFF: https://github.com/llvm/llvm-project/commit/f2c25c70791de95d2466e09b5b58fc37f6ccd7a4.diff

LOG: [CostModel] remove cost-kind predicate for some vector reduction costs

This is a modified second try of 22d10b8ab44f
(reverted by 1c8371692d because it exposed
an existing crashing bug that should be fixed by
74ffc823).

Original commit message:

This is similar in spirit to 01ea93d85d6e (memcpy), except that
here the underlying caller assumptions were made for vectorizer
use (throughput) rather than for other passes.

That meant targets could have an enormous throughput cost with no
corresponding size, latency, or blended cost increase.
The ARM costs show a small difference between throughput and
size because there's an underlying difference in cmp/sel
costs that is also predicated on cost-kind.

Paraphrasing from the previous commits:
This may not make sense for some callers, but at least now the
costs will be consistently wrong instead of mysteriously wrong.

Targets should provide better overrides if the current modeling
is not accurate.
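
For context, here is a minimal sketch (not part of this commit) of how a
caller might compare the cost kinds for one of these reduction intrinsics
through the TTI interface. The helper name and the printing are illustrative
assumptions; only the TargetTransformInfo / IntrinsicCostAttributes calls
mirror the in-tree interfaces.

// Illustrative sketch only: query a vector.reduce.* call under several
// cost kinds. reportReductionCosts is a made-up helper, not LLVM API.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void reportReductionCosts(const TargetTransformInfo &TTI,
                                 const IntrinsicInst &Reduce) {
  // Derive the cost query from the call itself (intrinsic ID, operand
  // types, fast-math flags).
  IntrinsicCostAttributes Attrs(Reduce.getIntrinsicID(), Reduce);

  // Before this patch, only TCK_RecipThroughput reached the type-based
  // reduction costing in BasicTTIImplBase; the other kinds fell back to
  // the base implementation's default (cost 1 in the tests below), so a
  // reduction could look nearly free for size/latency while being very
  // expensive for throughput.
  auto Throughput = TTI.getIntrinsicInstrCost(
      Attrs, TargetTransformInfo::TCK_RecipThroughput);
  auto CodeSize =
      TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_CodeSize);
  auto Latency =
      TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_Latency);

  errs() << "throughput=" << Throughput << " size=" << CodeSize
         << " latency=" << Latency << "\n";
}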

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
    llvm/test/Analysis/CostModel/ARM/reduce-add.ll
    llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d12e9257e5a7..c21e0d280d6d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1200,9 +1200,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::vector_reduce_fmin:
     case Intrinsic::vector_reduce_umax:
     case Intrinsic::vector_reduce_umin: {
-      // FIXME: all cost kinds should default to the same thing?
-      if (CostKind != TTI::TCK_RecipThroughput)
-        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
     }

diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
index bffaa98c82aa..ea9b6a07f4e4 100644
--- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll
@@ -213,11 +213,11 @@ define void @reduce_fmax(<16 x float> %va) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 620 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 620 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)

diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
index 2564c1e456c1..b3cc0adf7460 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-add.ll
@@ -22,19 +22,19 @@ define i32 @reduce_i64(i32 %arg) {
 ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'reduce_i64'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'reduce_i64'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
@@ -67,23 +67,23 @@ define i32 @reduce_i32(i32 %arg) {
 ; NEON-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'reduce_i32'
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 262 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; V8M-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; NEON-SIZE-LABEL: 'reduce_i32'
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 681 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1066 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; NEON-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %V2   = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)

diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index d3bf703513eb..1bedeb7c22d1 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -213,11 +213,11 @@ define void @reduce_fmax(<16 x float> %va) {
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fmax'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'reduce_fmax'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va)
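
As a usage note (not part of the patch): cost lines like the ones above come
from opt's cost-model printer. The invocation below follows the pattern of
these tests' RUN lines, but treat the exact triple and flags as assumptions
and check the RUN lines in the test files themselves:

opt -mtriple=x86_64-- -cost-model -analyze -cost-kind=code-size \
    llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll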


        

