[PATCH] Cost model support for lowered math builtins.

Thu Feb 28 10:32:08 PST 2013

bkramer added you to the CC list for the revision "Cost model support for lowered math builtins.".

Hi nadav, paul.redmond, rengolin,

This patch allows us to compile a function like

void foo(float *f) {
  for (unsigned i = 0; i != 1024; ++i)
    f[i] = floorf(f[i]);
}

into roundps if SSE4.1 is available, and to leave it unvectorized otherwise.

http://llvm-reviews.chandlerc.com/D466

Files:
  lib/CodeGen/BasicTargetTransformInfo.cpp
  test/Transforms/LoopVectorize/X86/intrinsic-cost.ll

Index: lib/CodeGen/BasicTargetTransformInfo.cpp
===================================================================

--- lib/CodeGen/BasicTargetTransformInfo.cpp
+++ lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -117,7 +117,6 @@
   return new BasicTTI(TLI);
 }
 
-
 bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
   return TLI->isLegalAddImmediate(imm);
 }
@@ -379,22 +378,77 @@
   return LT.first;
 }
 
-unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
+unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                          ArrayRef<Type *> Tys) const {
-  // assume that we need to scalarize this intrinsic.
-  unsigned ScalarizationCost = 0;
-  unsigned ScalarCalls = 1;
-  if (RetTy->isVectorTy()) {
-    ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
-    ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-  }
-  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
-    if (Tys[i]->isVectorTy()) {
-      ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+  unsigned ISD = 0;
+  switch (IID) {
+  default: {
+    // Assume that we need to scalarize this intrinsic.
+    unsigned ScalarizationCost = 0;
+    unsigned ScalarCalls = 1;
+    if (RetTy->isVectorTy()) {
+      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
       ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
     }
+    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
+      if (Tys[i]->isVectorTy()) {
+        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+      }
+    }
+
+    return ScalarCalls + ScalarizationCost;
+  }
+  // Look for intrinsics that can be lowered directly or turned into a scalar
+  // intrinsic call.
+  case Intrinsic::sqrt:    ISD = ISD::FSQRT;  break;
+  case Intrinsic::sin:     ISD = ISD::FSIN;   break;
+  case Intrinsic::cos:     ISD = ISD::FCOS;   break;
+  case Intrinsic::exp:     ISD = ISD::FEXP;   break;
+  case Intrinsic::exp2:    ISD = ISD::FEXP2;  break;
+  case Intrinsic::log:     ISD = ISD::FLOG;   break;
+  case Intrinsic::log10:   ISD = ISD::FLOG10; break;
+  case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
+  case Intrinsic::fabs:    ISD = ISD::FABS;   break;
+  case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
+  case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
+  case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
+  case Intrinsic::rint:    ISD = ISD::FRINT;  break;
+  case Intrinsic::pow:     ISD = ISD::FPOW;   break;
+  case Intrinsic::fma:     ISD = ISD::FMA;    break;
+  case Intrinsic::fmuladd: ISD = ISD::FMA;    break; // FIXME: mul + add?
   }
-  return ScalarCalls + ScalarizationCost;
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
+
+  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1.
+    // If the type is split to multiple registers, assume that there is some
+    // overhead to this.
+    // TODO: Once we have extract/insert subvector cost we need to use them.
+    if (LT.first > 1)
+      return LT.first * 2;
+    return LT.first * 1;
+  }
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // If the operation is custom lowered then assume
+    // that the code is twice as expensive.
+    return LT.first * 2;
+  }
+
+  // Else, assume that we need to scalarize this intrinsic. For math builtins
+  // this will emit a costly libcall, adding call overhead and spills. Make it
+  // very expensive.
+  if (RetTy->isVectorTy()) {
+    unsigned Num = RetTy->getVectorNumElements();
+    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
+                                                  Tys);
+    return 10 * Cost * Num;
+  }
+
+  // This is going to be turned into a library call, make it expensive.
+  return 10;
 }
 
 unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
Index: test/Transforms/LoopVectorize/X86/intrinsic-cost.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/X86/intrinsic-cost.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize < %s | FileCheck %s -check-prefix=NO
+; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -loop-vectorize < %s | FileCheck %s -check-prefix=YES
+
+define void @test1(float* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %call = tail call float @ceilf(float %0) nounwind readnone
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+
+; NO: @test1
+; NO-NOT: llvm.ceilf
+; NO: ret void
+
+; YES: @test1
+; YES: llvm.ceil.v4f32
+; YES: ret void
+
+}
+
+declare float @ceilf(float) nounwind readnone
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D466.2.patch
Type: text/x-patch
Size: 5348 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130228/72d452d5/attachment.bin>