[PATCH] R600/SI: Custom lower f64 sqrt

Fri Mar 6 16:10:31 PST 2015

On 03/06/2015 01:35 PM, Tom Stellard wrote:
> This is necessary to get the correct precision.
> ---
>   lib/Target/R600/SIISelLowering.cpp | 65 ++++++++++++++++++++++++++++++++++++++
>   lib/Target/R600/SIISelLowering.h   |  1 +
>   test/CodeGen/R600/fsqrt.ll         |  5 ++-
>   test/CodeGen/R600/rsq.ll           |  5 ++-
>   4 files changed, 74 insertions(+), 2 deletions(-)
I'm not sure we should be trying to implement a compliant sqrt for the 
LLVM intrinsic. The LLVM intrinsic leaves the result undefined for inputs 
< 0.0 (and in general doesn't guarantee IEEE 754 behavior), but OpenCL 
requires that sqrt return NaN in those cases. Last time I checked, the 
intrinsic constant folds to 0 for such inputs, which won't really work.

I think llvm.sqrt should still select to the instruction, and only be 
used for OpenCL's native_sqrt. This expanded form should be put into the 
library. I believe I already added the ldexp and other intrinsics for 
this purpose.

Alternatively, we can add an intrinsic for the instruction for 
native_sqrt, and implement the non-native sqrt in the library with a 
bounds check + return NAN for the special cases around the intrinsic. 
The backend then should have an optimization to eliminate the NAN check 
since we know the behavior of the expansion. This is probably the best 
option (and also applies for f32, although I believe the f32 sqrt 
instruction is precise enough for OpenCL as is).
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 7d794b8..117f8ab 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -81,6 +81,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
>     setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
>     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
>   
> +  setOperationAction(ISD::FSQRT, MVT::f64, Custom);
> +
>     // We need to custom lower vector stores from local memory
>     setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
>     setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
> @@ -703,6 +705,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
>       return LowerTrig(Op, DAG);
>     case ISD::SELECT: return LowerSELECT(Op, DAG);
>     case ISD::FDIV: return LowerFDIV(Op, DAG);
> +  case ISD::FSQRT: return LowerFSQRT(Op, DAG);
>     case ISD::STORE: return LowerSTORE(Op, DAG);
>     case ISD::GlobalAddress: {
>       MachineFunction &MF = DAG.getMachineFunction();
> @@ -1195,6 +1198,68 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
>     llvm_unreachable("Unexpected type for fdiv");
>   }
>   
> +SDValue SITargetLowering::LowerFSQRT(SDValue Op, SelectionDAG &DAG) const {
> +  assert(Op.getValueType() == MVT::f64);
> +  SDLoc SL(Op);
> +
> +  SDValue S01 = Op.getOperand(0);
> +
> +  SDValue Zero = DAG.getConstant(0, MVT::i32);
> +
> +  SDValue V01 = DAG.getConstantFP(BitsToDouble(0x1000000000000000), MVT::f64);
> +
> +  SDValue VCC = DAG.getSetCC(SL, MVT::i1, S01, V01, ISD::SETOLT);
> +
> +  SDValue V0 = DAG.getConstant(0x100, MVT::i32);
> +
> +  V0 = DAG.getSelect(SL, MVT::i32, VCC, V0, Zero);
> +
> +  V01 = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, S01, V0);
> +
> +  SDValue V23 = DAG.getNode(AMDGPUISD::RSQ, SL, MVT::f64, V01);
> +
> +  SDValue V45 = DAG.getNode(ISD::FMUL, SL, MVT::f64, V01, V23);
> +
> +  SDValue Half = DAG.getConstantFP(0.5, MVT::f64);
> +  V23 = DAG.getNode(ISD::FMUL, SL, MVT::f64, V23, Half);
> +
> +  SDValue NegV23 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V23);
> +  SDValue V67 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV23, V45, Half);
> +
> +  V45 = DAG.getNode(ISD::FMA, SL, MVT::f64, V45, V67, V45);
> +
> +  SDValue NegV45 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V45);
> +  SDValue V89 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV45, V45, V01);
> +
> +  V23 = DAG.getNode(ISD::FMA, SL, MVT::f64, V23, V67, V23);
> +
> +  V45 = DAG.getNode(ISD::FMA, SL, MVT::f64, V89, V23, V45);
> +
> +  NegV45 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V45);
> +  V67 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV45, V45, V01);
> +
> +  SDValue V8 = DAG.getConstant(0xffffff80, MVT::i32);
> +
> +  SDValue V910 = DAG.getConstantFP(BitsToDouble(0x7ff0000000000000), MVT::f64);
> +
> +  V23 = DAG.getNode(ISD::FMA, SL, MVT::f64, V67, V23, V45);
> +
> +  SDValue V4 = DAG.getSelect(SL, MVT::i32, VCC, V8, Zero);
> +
> +  SDValue S89 = DAG.getSetCC(SL, MVT::i1, S01, V910, ISD::SETOEQ);
> +
> +  SDValue DZero = DAG.getConstantFP(0.0, MVT::f64);
> +  VCC = DAG.getSetCC(SL, MVT::i1, S01, DZero, ISD::SETOEQ);
> +
> +  V23 = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, V23, V4);
> +
> +  VCC = DAG.getNode(ISD::OR, SL, MVT::i1, S89, VCC);
> +
> +  V01 = DAG.getSelect(SL, MVT::f64, VCC, V01, V23);
> +
> +  return V01;
> +}
> +
>   SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
>     SDLoc DL(Op);
>     StoreSDNode *Store = cast<StoreSDNode>(Op);
> diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> index 92f5847..ff04704 100644
> --- a/lib/Target/R600/SIISelLowering.h
> +++ b/lib/Target/R600/SIISelLowering.h
> @@ -36,6 +36,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
>     SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
> +  SDValue LowerFSQRT(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
>     SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
> index 0410134..36274be 100644
> --- a/test/CodeGen/R600/fsqrt.ll
> +++ b/test/CodeGen/R600/fsqrt.ll
> @@ -16,7 +16,10 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
>   }
>   
>   ; CHECK: {{^}}fsqrt_f64:
> -; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
> +; CHECK-NOT: v_sqrt_f64
> +; CHECK: v_ldexp_f64
> +; CHECK: v_rsq_f64
> +; CHECK: v_ldexp_f64
>   
>   define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
>      %r0 = load double, double addrspace(1)* %in
> diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
> index b67b800..f9e4e15 100644
> --- a/test/CodeGen/R600/rsq.ll
> +++ b/test/CodeGen/R600/rsq.ll
> @@ -18,7 +18,10 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
>   
>   ; SI-LABEL: {{^}}rsq_f64:
>   ; SI-UNSAFE: v_rsq_f64_e32
> -; SI-SAFE: v_sqrt_f64_e32
> +; SI-SAFE-NOT: v_sqrt_f64
> +; SI-SAFE: v_ldexp_f64
> +; SI-SAFE: v_rsq_f64
> +; SI-SAFE: v_ldexp_f64
>   ; SI: s_endpgm
>   define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
>     %val = load double, double addrspace(1)* %in, align 4
> -- 
> 2.0.4
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150306/920472c4/attachment.html>