<html>

  <head>

    <meta content="text/html; charset=windows-1252"

      http-equiv="Content-Type">

  </head>

  <body bgcolor="#FFFFFF" text="#000000">

    On 03/06/2015 01:35 PM, Tom Stellard wrote:<br>

    <blockquote

      cite="mid:1425677742-16381-1-git-send-email-thomas.stellard@amd.com"

      type="cite">

      <div class="moz-text-plain" wrap="true" graphical-quote="true"

        style="font-family: -moz-fixed; font-size: 12px;"

        lang="x-western">

        <pre wrap="">This is necessary to get the correct precision.

---

 lib/Target/R600/SIISelLowering.cpp | 65 ++++++++++++++++++++++++++++++++++++++

 lib/Target/R600/SIISelLowering.h   |  1 +

 test/CodeGen/R600/fsqrt.ll         |  5 ++-

 test/CodeGen/R600/rsq.ll           |  5 ++-

 4 files changed, 74 insertions(+), 2 deletions(-)</pre>

      </div>

    </blockquote>

    I'm not sure we should be trying to implement a compliant sqrt for

    the LLVM intrinsic. The LLVM intrinsic leaves the result undefined

    for inputs less than 0.0 (and in general doesn't guarantee IEEE 754

    behavior), but OpenCL requires sqrt to return NaN there. Last time I

    checked, it constant folded to 0 in that case, which won't really

    work.<br>

    <br>

    I think llvm.sqrt should still select to the instruction, and only

    be used for OpenCL's native_sqrt. This expanded form should be put

    into the library. I believe I already added the ldexp and other

    intrinsics for this purpose.<br>

    <br>

    Alternatively, we can add an intrinsic for the instruction for

    native_sqrt, and implement the non-native sqrt in the library as a

    bounds check around the intrinsic that returns NaN for the special

    cases. The backend should then have an optimization to eliminate

    the NaN check, since we know the behavior of the expansion. This is

    probably the best option (and it also applies to f32, although I

    believe the f32 sqrt instruction is precise enough for OpenCL as is).<br>

    <blockquote

      cite="mid:1425677742-16381-1-git-send-email-thomas.stellard@amd.com"

      type="cite">

      <div class="moz-text-plain" wrap="true" graphical-quote="true"

        style="font-family: -moz-fixed; font-size: 12px;"

        lang="x-western">

        <pre wrap="">

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp

index 7d794b8..117f8ab 100644

--- a/lib/Target/R600/SIISelLowering.cpp

+++ b/lib/Target/R600/SIISelLowering.cpp

@@ -81,6 +81,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,

   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);

   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

+  setOperationAction(ISD::FSQRT, MVT::f64, Custom);

+

   // We need to custom lower vector stores from local memory

   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);

@@ -703,6 +705,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {

     return LowerTrig(Op, DAG);

   case ISD::SELECT: return LowerSELECT(Op, DAG);

   case ISD::FDIV: return LowerFDIV(Op, DAG);

+  case ISD::FSQRT: return LowerFSQRT(Op, DAG);

   case ISD::STORE: return LowerSTORE(Op, DAG);

   case ISD::GlobalAddress: {

     MachineFunction &MF = DAG.getMachineFunction();

@@ -1195,6 +1198,68 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {

   llvm_unreachable("Unexpected type for fdiv");

 }

+SDValue SITargetLowering::LowerFSQRT(SDValue Op, SelectionDAG &DAG) const {

+  assert(Op.getValueType() == MVT::f64);

+  SDLoc SL(Op);

+

+  SDValue S01 = Op.getOperand(0);

+

+  SDValue Zero = DAG.getConstant(0, MVT::i32);

+

+  SDValue V01 = DAG.getConstantFP(BitsToDouble(0x1000000000000000), MVT::f64);

+

+  SDValue VCC = DAG.getSetCC(SL, MVT::i1, S01, V01, ISD::SETOLT);

+

+  SDValue V0 = DAG.getConstant(0x100, MVT::i32);

+

+  V0 = DAG.getSelect(SL, MVT::i32, VCC, V0, Zero);

+

+  V01 = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, S01, V0);

+

+  SDValue V23 = DAG.getNode(AMDGPUISD::RSQ, SL, MVT::f64, V01);

+

+  SDValue V45 = DAG.getNode(ISD::FMUL, SL, MVT::f64, V01, V23);

+

+  SDValue Half = DAG.getConstantFP(0.5, MVT::f64);

+  V23 = DAG.getNode(ISD::FMUL, SL, MVT::f64, V23, Half);

+

+  SDValue NegV23 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V23);

+  SDValue V67 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV23, V45, Half);

+

+  V45 = DAG.getNode(ISD::FMA, SL, MVT::f64, V45, V67, V45);

+

+  SDValue NegV45 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V45);

+  SDValue V89 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV45, V45, V01);

+

+  V23 = DAG.getNode(ISD::FMA, SL, MVT::f64, V23, V67, V23);

+

+  V45 = DAG.getNode(ISD::FMA, SL, MVT::f64, V89, V23, V45);

+

+  NegV45 = DAG.getNode(ISD::FNEG, SL, MVT::f64, V45);

+  V67 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegV45, V45, V01);

+

+  SDValue V8 = DAG.getConstant(0xffffff80, MVT::i32);

+

+  SDValue V910 = DAG.getConstantFP(BitsToDouble(0x7ff0000000000000), MVT::f64);

+

+  V23 = DAG.getNode(ISD::FMA, SL, MVT::f64, V67, V23, V45);

+

+  SDValue V4 = DAG.getSelect(SL, MVT::i32, VCC, V8, Zero);

+

+  SDValue S89 = DAG.getSetCC(SL, MVT::i1, S01, V910, ISD::SETOEQ);

+

+  SDValue DZero = DAG.getConstantFP(0.0, MVT::f64);

+  VCC = DAG.getSetCC(SL, MVT::i1, S01, DZero, ISD::SETOEQ);

+

+  V23 = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, V23, V4);

+

+  VCC = DAG.getNode(ISD::OR, SL, MVT::i1, S89, VCC);

+

+  V01 = DAG.getSelect(SL, MVT::f64, VCC, V01, V23);

+

+  return V01;

+}

+

 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {

   SDLoc DL(Op);

   StoreSDNode *Store = cast&lt;StoreSDNode&gt;(Op);

diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h

index 92f5847..ff04704 100644

--- a/lib/Target/R600/SIISelLowering.h

+++ b/lib/Target/R600/SIISelLowering.h

@@ -36,6 +36,7 @@ class SITargetLowering : public AMDGPUTargetLowering {

   SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;

   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;

   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;

+  SDValue LowerFSQRT(SDValue Op, SelectionDAG &DAG) const;

   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;

   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;

   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;

diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll

index 0410134..36274be 100644

--- a/test/CodeGen/R600/fsqrt.ll

+++ b/test/CodeGen/R600/fsqrt.ll

@@ -16,7 +16,10 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {

 }

 ; CHECK: {{^}}fsqrt_f64:

-; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}

+; CHECK-NOT: v_sqrt_f64

+; CHECK: v_ldexp_f64

+; CHECK: v_rsq_f64

+; CHECK: v_ldexp_f64

 define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {

    %r0 = load double, double addrspace(1)* %in

diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll

index b67b800..f9e4e15 100644

--- a/test/CodeGen/R600/rsq.ll

+++ b/test/CodeGen/R600/rsq.ll

@@ -18,7 +18,10 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali

 ; SI-LABEL: {{^}}rsq_f64:

 ; SI-UNSAFE: v_rsq_f64_e32

-; SI-SAFE: v_sqrt_f64_e32

+; SI-SAFE-NOT: v_sqrt_f64

+; SI-SAFE: v_ldexp_f64

+; SI-SAFE: v_rsq_f64

+; SI-SAFE: v_ldexp_f64

 ; SI: s_endpgm

 define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {

   %val = load double, double addrspace(1)* %in, align 4

<div class="moz-txt-sig">-- 

2.0.4

_______________________________________________

llvm-commits mailing list

<a moz-do-not-send="true" class="moz-txt-link-abbreviated" href="mailto:llvm-commits@cs.uiuc.edu">llvm-commits@cs.uiuc.edu</a>

<a moz-do-not-send="true" class="moz-txt-link-freetext" href="http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits">http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits</a>

</div></pre>

      </div>

    </blockquote>

    <br>

  </body>

</html>