[PATCH] Cost model support for lowered math builtins.

Arnold Schwaighofer aschwaighofer at apple.com
Thu Feb 28 10:46:53 PST 2013


LGTM,

Can you write shorter tests using the cost model analysis pass?

opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s

See for example: test/Analysis/CostModel/X86/arith.ll
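
Something like this would do (a minimal sketch; the cost number below is
illustrative and would have to match what the patch actually returns for
this target):

define <4 x float> @test_floor(<4 x float> %x) {
  ; CHECK: cost of 1 {{.*}} call <4 x float> @llvm.floor.v4f32
  %r = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  ret <4 x float> %r
}

declare <4 x float> @llvm.floor.v4f32(<4 x float>)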

Thanks,
Arnold

On Feb 28, 2013, at 12:32 PM, Benjamin Kramer <benny.kra at gmail.com> wrote:

> bkramer added you to the CC list for the revision "Cost model support for lowered math builtins.".
> 
> Hi nadav, paul.redmond, rengolin,
> 
> This patch allows us to compile a function like
> 
> void foo(float *f) {
>  for (unsigned i = 0; i != 1024; ++i)
>    f[i] = floorf(f[i]);
> }
> 
> into roundps when SSE4.1 is available, and to not vectorize it otherwise.
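> 
> Roughly speaking, the vectorized loop body then looks something like
> this (a hand-written sketch, not actual compiler output), which the
> backend can lower to a single roundps:
> 
>   %wide.load = load <4 x float>* %ptr, align 4
>   %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %wide.load)
>   store <4 x float> %v, <4 x float>* %ptr, align 4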
> 
> http://llvm-reviews.chandlerc.com/D466
> 
> Files:
>  lib/CodeGen/BasicTargetTransformInfo.cpp
>  test/Transforms/LoopVectorize/X86/intrinsic-cost.ll
> 
> Index: lib/CodeGen/BasicTargetTransformInfo.cpp
> ===================================================================
> --- lib/CodeGen/BasicTargetTransformInfo.cpp
> +++ lib/CodeGen/BasicTargetTransformInfo.cpp
> @@ -117,7 +117,6 @@
>   return new BasicTTI(TLI);
> }
> 
> -
> bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
>   return TLI->isLegalAddImmediate(imm);
> }
> @@ -379,22 +378,77 @@
>   return LT.first;
> }
> 
> -unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
> +unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
>                                          ArrayRef<Type *> Tys) const {
> -  // assume that we need to scalarize this intrinsic.
> -  unsigned ScalarizationCost = 0;
> -  unsigned ScalarCalls = 1;
> -  if (RetTy->isVectorTy()) {
> -    ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
> -    ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
> -  }
> -  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
> -    if (Tys[i]->isVectorTy()) {
> -      ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
> +  unsigned ISD = 0;
> +  switch (IID) {
> +  default: {
> +    // Assume that we need to scalarize this intrinsic.
> +    unsigned ScalarizationCost = 0;
> +    unsigned ScalarCalls = 1;
> +    if (RetTy->isVectorTy()) {
> +      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
>       ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
>     }
> +    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
> +      if (Tys[i]->isVectorTy()) {
> +        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
> +        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
> +      }
> +    }
> +
> +    return ScalarCalls + ScalarizationCost;
> +  }
> +  // Look for intrinsics that can be lowered directly or turned into a scalar
> +  // intrinsic call.
> +  case Intrinsic::sqrt:    ISD = ISD::FSQRT;  break;
> +  case Intrinsic::sin:     ISD = ISD::FSIN;   break;
> +  case Intrinsic::cos:     ISD = ISD::FCOS;   break;
> +  case Intrinsic::exp:     ISD = ISD::FEXP;   break;
> +  case Intrinsic::exp2:    ISD = ISD::FEXP2;  break;
> +  case Intrinsic::log:     ISD = ISD::FLOG;   break;
> +  case Intrinsic::log10:   ISD = ISD::FLOG10; break;
> +  case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
> +  case Intrinsic::fabs:    ISD = ISD::FABS;   break;
> +  case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
> +  case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
> +  case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
> +  case Intrinsic::rint:    ISD = ISD::FRINT;  break;
> +  case Intrinsic::pow:     ISD = ISD::FPOW;   break;
> +  case Intrinsic::fma:     ISD = ISD::FMA;    break;
> +  case Intrinsic::fmuladd: ISD = ISD::FMA;    break; // FIXME: mul + add?
>   }
> -  return ScalarCalls + ScalarizationCost;
> +
> +  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
> +
> +  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
> +    // The operation is legal. Assume it costs 1.
> +    // If the type is split into multiple registers, assume that there
> +    // is some overhead to this.
> +    // TODO: Once we have extract/insert subvector cost we need to use them.
> +    if (LT.first > 1)
> +      return LT.first * 2;
> +    return LT.first * 1;
> +  }
> +
> +  if (!TLI->isOperationExpand(ISD, LT.second)) {
> +    // If the operation is custom lowered then assume that the code is
> +    // twice as expensive.
> +    return LT.first * 2;
> +  }
> +
> +  // Else, assume that we need to scalarize this intrinsic. For math builtins
> +  // this will emit a costly libcall, adding call overhead and spills. Make it
> +  // very expensive.
> +  if (RetTy->isVectorTy()) {
> +    unsigned Num = RetTy->getVectorNumElements();
> +    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
> +                                                  Tys);
> +    return 10 * Cost * Num;
> +  }
> +
> +  // This is going to be turned into a library call, make it expensive.
> +  return 10;
> }
> 
> unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
> Index: test/Transforms/LoopVectorize/X86/intrinsic-cost.ll
> ===================================================================
> --- /dev/null
> +++ test/Transforms/LoopVectorize/X86/intrinsic-cost.ll
> @@ -0,0 +1,32 @@
> +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize < %s | FileCheck %s -check-prefix=NO
> +; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -loop-vectorize < %s | FileCheck %s -check-prefix=YES
> +
> +define void @test1(float* nocapture %x) nounwind {
> +entry:
> +  br label %for.body
> +
> +for.body:                                         ; preds = %entry, %for.body
> +  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> +  %arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
> +  %0 = load float* %arrayidx, align 4
> +  %call = tail call float @ceilf(float %0) nounwind readnone
> +  store float %call, float* %arrayidx, align 4
> +  %indvars.iv.next = add i64 %indvars.iv, 1
> +  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> +  %exitcond = icmp eq i32 %lftr.wideiv, 1024
> +  br i1 %exitcond, label %for.end, label %for.body
> +
> +for.end:                                          ; preds = %for.body
> +  ret void
> +
> +; NO: @test1
> +; NO-NOT: llvm.ceil.v4f32
> +; NO: ret void
> +
> +; YES: @test1
> +; YES: llvm.ceil.v4f32
> +; YES: ret void
> +
> +}
> +
> +declare float @ceilf(float) nounwind readnone
> <D466.2.patch>
