[llvm] r293713 - [NVPTX] Compute approx sqrt as 1/rsqrt(x) rather than x*rsqrt(x).
Justin Lebar via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 31 15:08:58 PST 2017
Author: jlebar
Date: Tue Jan 31 17:08:57 2017
New Revision: 293713
URL: http://llvm.org/viewvc/llvm-project?rev=293713&view=rev
Log:
[NVPTX] Compute approx sqrt as 1/rsqrt(x) rather than x*rsqrt(x).
x*rsqrt(x) returns NaN for x == 0 (rsqrt(0) is +inf, and 0 * inf is NaN),
whereas 1/rsqrt(x) returns 0 (1/inf is 0), as desired.
Verified that the particular nvptx approximate instructions here do in
fact return 0 for x = 0.
Modified:
llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/trunk/test/CodeGen/NVPTX/fast-math.ll
llvm/trunk/test/CodeGen/NVPTX/sqrt-approx.ll
Modified: llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp?rev=293713&r1=293712&r2=293713&view=diff
==============================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp Tue Jan 31 17:08:57 2017
@@ -1080,9 +1080,14 @@ SDValue NVPTXTargetLowering::getSqrtEsti
return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
: Intrinsic::nvvm_sqrt_approx_f);
else {
- // There's no sqrt.approx.f64 instruction, so we emit x * rsqrt(x).
- return DAG.getNode(ISD::FMUL, DL, VT, Operand,
- MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
+ // There's no sqrt.approx.f64 instruction, so we emit
+ // reciprocal(rsqrt(x)). This is faster than
+ // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
+ // x * rsqrt(x).)
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
+ MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
}
}
}
Modified: llvm/trunk/test/CodeGen/NVPTX/fast-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/fast-math.ll?rev=293713&r1=293712&r2=293713&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/NVPTX/fast-math.ll (original)
+++ llvm/trunk/test/CodeGen/NVPTX/fast-math.ll Tue Jan 31 17:08:57 2017
@@ -40,11 +40,11 @@ define float @sqrt_div_fast_ftz(float %a
}
; There are no fast-math or ftz versions of sqrt and div for f64. We use
-; x * rsqrt(x) for sqrt(x), and emit a vanilla divide.
+; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
; CHECK-LABEL: sqrt_div_fast_ftz_f64(
; CHECK: rsqrt.approx.f64
-; CHECK: mul.f64
+; CHECK: rcp.approx.ftz.f64
; CHECK: div.rn.f64
define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
%t1 = tail call double @llvm.sqrt.f64(double %a)
Modified: llvm/trunk/test/CodeGen/NVPTX/sqrt-approx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/NVPTX/sqrt-approx.ll?rev=293713&r1=293712&r2=293713&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/NVPTX/sqrt-approx.ll (original)
+++ llvm/trunk/test/CodeGen/NVPTX/sqrt-approx.ll Tue Jan 31 17:08:57 2017
@@ -59,9 +59,11 @@ define float @test_sqrt_ftz(float %a) #0
; CHECK-LABEL test_sqrt64
define double @test_sqrt64(double %a) #0 {
-; There's no sqrt.approx.f64 instruction; we emit x * rsqrt.approx.f64(x).
+; There's no sqrt.approx.f64 instruction; we emit
+; reciprocal(rsqrt.approx.f64(x)). There's no non-ftz approximate reciprocal,
+; so we just use the ftz version.
; CHECK: rsqrt.approx.f64
-; CHECK: mul.f64
+; CHECK: rcp.approx.ftz.f64
%ret = tail call double @llvm.sqrt.f64(double %a)
ret double %ret
}
@@ -70,7 +72,7 @@ define double @test_sqrt64(double %a) #0
define double @test_sqrt64_ftz(double %a) #0 #1 {
; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
; CHECK: rsqrt.approx.f64
-; CHECK: mul.f64
+; CHECK: rcp.approx.ftz.f64
%ret = tail call double @llvm.sqrt.f64(double %a)
ret double %ret
}
More information about the llvm-commits mailing list