[PATCH] R600/SI: implement range reduction for sin/cos

Wed Jun 25 14:48:27 PDT 2014

On 06/25/2014 01:14 PM, Grigori Goronzy wrote:
> These instructions can only take a limited input range, and return
> the constant value 1 out of range. We should do range reduction to
> be able to process arbitrary values. Use a FRACT instruction after
> normalization to achieve this.
>
> v2: use DAG lowering instead of intrinsic, adapt test
> ---
>   lib/Target/R600/SIISelLowering.cpp | 28 ++++++++++++++++++++++++++++
>   lib/Target/R600/SIISelLowering.h   |  1 +
>   lib/Target/R600/SIInstructions.td  |  8 ++++----
>   test/CodeGen/R600/llvm.sin.ll      |  1 +
>   4 files changed, 34 insertions(+), 4 deletions(-)
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 29e4b98..b543816 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -80,6 +80,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
>     setOperationAction(ISD::SUBC, MVT::i32, Legal);
>     setOperationAction(ISD::SUBE, MVT::i32, Legal);
>   
> +  setOperationAction(ISD::FSIN, MVT::f32, Custom);
> +  setOperationAction(ISD::FCOS, MVT::f32, Custom);
> +
>     // We need to custom lower vector stores from local memory
>     setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
>     setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
> @@ -617,6 +620,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
>       }
>     }
>   
> +  case ISD::FSIN:
> +  case ISD::FCOS:
> +    return LowerTrig(Op, DAG);
>     case ISD::SELECT: return LowerSELECT(Op, DAG);
>     case ISD::STORE: return LowerSTORE(Op, DAG);
>     case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
> @@ -992,6 +998,28 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
>     return Chain;
>   }
>   
> +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
> +  EVT VT = Op.getValueType();
> +  SDValue Arg = Op.getOperand(0);
> +  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
> +        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
> +          DAG.getConstantFP(0.15915494309, MVT::f32)));
Can you write this constant as an expression in terms of pi (e.g.
0.5 / M_PI) instead of the fully computed literal? That makes it obvious
that the value is 1 / (2 * pi).

> +  unsigned TrigNode;
> +
> +  switch (Op.getOpcode()) {
> +  case ISD::FCOS:
> +    TrigNode = AMDGPUISD::COS_HW;
> +    break;
> +  case ISD::FSIN:
> +    TrigNode = AMDGPUISD::SIN_HW;
> +    break;
> +  default:
> +    llvm_unreachable("Wrong trig opcode");
> +  }
> +
> +  return DAG.getNode(TrigNode, SDLoc(Op), VT, FractPart);
> +}
> +
>   //===----------------------------------------------------------------------===//
>   // Custom DAG optimizations
>   //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
> index 2f97a9a..10ba8f9 100644
> --- a/lib/Target/R600/SIISelLowering.h
> +++ b/lib/Target/R600/SIISelLowering.h
> @@ -28,6 +28,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
>     SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> +  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
>     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
>   
>     bool foldImm(SDValue &Operand, int32_t &Immediate,
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 0b12f60..7fc56a5 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -2253,13 +2253,13 @@ def : Pat<
>   >;
>   
>   def : Pat <
> -  (fcos f32:$src0),
> -  (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
> +  (COS_HW f32:$src0),
> +  (V_COS_F32_e32 $src0)
>   >;
>   
>   def : Pat <
> -  (fsin f32:$src0),
> -  (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
> +  (SIN_HW f32:$src0),
> +  (V_SIN_F32_e32 $src0)
>   >;
Since these patterns now map a single node directly to a single
instruction, you should be able to move them into the instruction
definitions.

>   
>   def : Pat <
> diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
> index 41c363c..dafbdbc 100644
> --- a/test/CodeGen/R600/llvm.sin.ll
> +++ b/test/CodeGen/R600/llvm.sin.ll
> @@ -8,6 +8,7 @@
>   ;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
>   ;EG-NOT: SIN
>   ;SI: V_MUL_F32
> +;SI: V_FRACT_F32
>   ;SI: V_SIN_F32
>   ;SI-NOT: V_SIN_F32
>   
An additional test that runs with -enable-unsafe-fp-math and checks that
the conversion is folded with other operations might be helpful, but it
is not that important.