[llvm-commits] [llvm] r92388 - in /llvm/trunk: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp lib/Target/README.txt test/CodeGen/X86/2007-09-27-LDIntrinsics.ll test/CodeGen/X86/powi.ll
Chris Lattner
sabre at nondot.org
Thu Dec 31 19:32:17 PST 2009
Author: lattner
Date: Thu Dec 31 21:32:16 2009
New Revision: 92388
URL: http://llvm.org/viewvc/llvm-project?rev=92388&view=rev
Log:
Teach codegen to lower llvm.powi to an efficient (but not optimal)
multiply sequence when the power is a constant integer. Before, our
codegen for std::pow(.., int) always turned into a libcall, which was
really inefficient.
This should also make many gfortran programs happier I'd imagine.
Added:
llvm/trunk/test/CodeGen/X86/powi.ll
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/trunk/lib/Target/README.txt
llvm/trunk/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp?rev=92388&r1=92387&r2=92388&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp Thu Dec 31 21:32:16 2009
@@ -462,7 +462,8 @@
// The number of parts is a power of 2. Repeatedly bisect the value using
// EXTRACT_ELEMENT.
Parts[0] = DAG.getNode(ISD::BIT_CONVERT, dl,
- EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()),
+ EVT::getIntegerVT(*DAG.getContext(),
+ ValueVT.getSizeInBits()),
Val);
if (DisableScheduling)
@@ -4261,6 +4262,59 @@
setValue(&I, result);
}
+
+/// ExpandPowI - Expand a llvm.powi intrinsic.
+static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG) {
+ // If RHS is a constant, we can expand this out to a multiplication tree,
+ // otherwise we end up lowering to a call to __powidf2 (for example). When
+ // optimizing for size, we only want to do this if the expansion would produce
+ // a small number of multiplies, otherwise we do the full expansion.
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ // Get the exponent as a positive value.
+ unsigned Val = RHSC->getSExtValue();
+ if ((int)Val < 0) Val = -Val;
+
+ // powi(x, 0) -> 1.0
+ if (Val == 0)
+ return DAG.getConstantFP(1.0, LHS.getValueType());
+
+ Function *F = DAG.getMachineFunction().getFunction();
+ if (!F->hasFnAttr(Attribute::OptimizeForSize) ||
+ // If optimizing for size, don't insert too many multiplies. This
+ // inserts up to 5 multiplies.
+ CountPopulation_32(Val)+Log2_32(Val) < 7) {
+ // We use the simple binary decomposition method to generate the multiply
+ // sequence. There are more optimal ways to do this (for example,
+ // powi(x,15) generates one more multiply than it should), but this has
+ // the benefit of being both really simple and much better than a libcall.
+ SDValue Res; // Logically starts equal to 1.0
+ SDValue CurSquare = LHS;
+ while (Val) {
+ if (Val & 1)
+ if (Res.getNode())
+ Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
+ else
+ Res = CurSquare; // 1.0*CurSquare.
+
+ CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
+ CurSquare, CurSquare);
+ Val >>= 1;
+ }
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (RHSC->getSExtValue() < 0)
+ Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
+ DAG.getConstantFP(1.0, LHS.getValueType()), Res);
+ return Res;
+ }
+ }
+
+ // Otherwise, expand to a libcall.
+ return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
+}
+
+
/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If
/// we want to emit this as a call to a named external function, return the name
/// otherwise lower it and return null.
@@ -4536,10 +4590,8 @@
DAG.AssignOrdering(Res.getNode(), SDNodeOrder);
return 0;
case Intrinsic::powi:
- Res = DAG.getNode(ISD::FPOWI, dl,
- getValue(I.getOperand(1)).getValueType(),
- getValue(I.getOperand(1)),
- getValue(I.getOperand(2)));
+ Res = ExpandPowI(dl, getValue(I.getOperand(1)), getValue(I.getOperand(2)),
+ DAG);
setValue(&I, Res);
if (DisableScheduling)
DAG.AssignOrdering(Res.getNode(), SDNodeOrder);
Modified: llvm/trunk/lib/Target/README.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/README.txt?rev=92388&r1=92387&r2=92388&view=diff
==============================================================================
--- llvm/trunk/lib/Target/README.txt (original)
+++ llvm/trunk/lib/Target/README.txt Thu Dec 31 21:32:16 2009
@@ -756,36 +756,6 @@
//===---------------------------------------------------------------------===//
-We generate a horrible libcall for llvm.powi. For example, we compile:
-
-#include <cmath>
-double f(double a) { return std::pow(a, 4); }
-
-into:
-
-__Z1fd:
- subl $12, %esp
- movsd 16(%esp), %xmm0
- movsd %xmm0, (%esp)
- movl $4, 8(%esp)
- call L___powidf2$stub
- addl $12, %esp
- ret
-
-GCC produces:
-
-__Z1fd:
- subl $12, %esp
- movsd 16(%esp), %xmm0
- mulsd %xmm0, %xmm0
- mulsd %xmm0, %xmm0
- movsd %xmm0, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-//===---------------------------------------------------------------------===//
-
We compile this program: (from GCC PR11680)
http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
Modified: llvm/trunk/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll?rev=92388&r1=92387&r2=92388&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll Thu Dec 31 21:32:16 2009
@@ -1,47 +1,30 @@
-; RUN: llc < %s | grep powixf2
-; RUN: llc < %s | grep fsqrt
-; ModuleID = 'yyy.c'
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
-define x86_fp80 @foo(x86_fp80 %x) {
+define x86_fp80 @foo(x86_fp80 %x) nounwind{
entry:
- %x_addr = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %retval = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %tmp = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- store x86_fp80 %x, x86_fp80* %x_addr
- %tmp1 = load x86_fp80* %x_addr, align 16 ; <x86_fp80> [#uses=1]
- %tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %tmp1 ) ; <x86_fp80> [#uses=1]
- store x86_fp80 %tmp2, x86_fp80* %tmp, align 16
- %tmp3 = load x86_fp80* %tmp, align 16 ; <x86_fp80> [#uses=1]
- store x86_fp80 %tmp3, x86_fp80* %retval, align 16
- br label %return
-
-return: ; preds = %entry
- %retval4 = load x86_fp80* %retval ; <x86_fp80> [#uses=1]
- ret x86_fp80 %retval4
+ %tmp2 = call x86_fp80 @llvm.sqrt.f80( x86_fp80 %x )
+ ret x86_fp80 %tmp2
+
+; CHECK: foo:
+; CHECK: fldt 4(%esp)
+; CHECK-NEXT: fsqrt
+; CHECK-NEXT: ret
}
declare x86_fp80 @llvm.sqrt.f80(x86_fp80)
-define x86_fp80 @bar(x86_fp80 %x) {
+define x86_fp80 @bar(x86_fp80 %x) nounwind {
entry:
- %x_addr = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %retval = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %tmp = alloca x86_fp80 ; <x86_fp80*> [#uses=2]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- store x86_fp80 %x, x86_fp80* %x_addr
- %tmp1 = load x86_fp80* %x_addr, align 16 ; <x86_fp80> [#uses=1]
- %tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %tmp1, i32 3 ) ; <x86_fp80> [#uses=1]
- store x86_fp80 %tmp2, x86_fp80* %tmp, align 16
- %tmp3 = load x86_fp80* %tmp, align 16 ; <x86_fp80> [#uses=1]
- store x86_fp80 %tmp3, x86_fp80* %retval, align 16
- br label %return
-
-return: ; preds = %entry
- %retval4 = load x86_fp80* %retval ; <x86_fp80> [#uses=1]
- ret x86_fp80 %retval4
+ %tmp2 = call x86_fp80 @llvm.powi.f80( x86_fp80 %x, i32 3 )
+ ret x86_fp80 %tmp2
+; CHECK: bar:
+; CHECK: fldt 4(%esp)
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: fmul %st(1)
+; CHECK-NEXT: fmulp %st(1)
+; CHECK-NEXT: ret
}
declare x86_fp80 @llvm.powi.f80(x86_fp80, i32)
Added: llvm/trunk/test/CodeGen/X86/powi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/powi.ll?rev=92388&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/powi.ll (added)
+++ llvm/trunk/test/CodeGen/X86/powi.ll Thu Dec 31 21:32:16 2009
@@ -0,0 +1,11 @@
+; RUN: llc %s -march=x86 -mcpu=yonah -o - | grep mulsd | count 6
+; Ideally this would compile to 5 multiplies.
+
+define double @_Z3f10d(double %a) nounwind readonly ssp noredzone {
+entry:
+ %0 = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
+ ret double %0
+}
+
+declare double @llvm.powi.f64(double, i32) nounwind readonly
+
More information about the llvm-commits
mailing list