[PATCH] R600: Add intrinsics for math helper instructions

Tue Jun 17 10:22:06 PDT 2014

On Tue, Jun 17, 2014 at 10:05:19AM -0700, Matt Arsenault wrote:
> On 06/17/2014 07:10 AM, Tom Stellard wrote:
> >On Tue, Jun 17, 2014 at 12:28:14AM +0000, Matt Arsenault wrote:
> >>These will be used in the custom lowering of various math functions and in library implementations of them, so it's useful to expose these as builtins.
> >>
> >>http://reviews.llvm.org/D4168
> >>
> >>Files:
> >>   include/llvm/IR/IntrinsicsR600.td
> >>   lib/Target/R600/AMDGPUISelLowering.cpp
> >>   lib/Target/R600/AMDGPUISelLowering.h
> >>   lib/Target/R600/AMDGPUInstrInfo.td
> >>   lib/Target/R600/AMDGPUInstructions.td
> >>   lib/Target/R600/AMDGPUIntrinsics.td
> >>   lib/Target/R600/SIInsertWaits.cpp
> >>   lib/Target/R600/SIInstructions.td
> >>   lib/Transforms/InstCombine/InstCombineCalls.cpp
> >>   test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
> >>   test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
> >>   test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> >>   test/CodeGen/R600/llvm.AMDGPU.rcp.ll
> >>   test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
> >>   test/Transforms/InstCombine/r600-intrinsics.ll
> >>Index: include/llvm/IR/IntrinsicsR600.td
> >>===================================================================
> >>--- include/llvm/IR/IntrinsicsR600.td
> >>+++ include/llvm/IR/IntrinsicsR600.td
> >>@@ -33,4 +33,34 @@
> >>                                         "__builtin_r600_read_tgid">;
> >>  defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
> >>                                         "__builtin_r600_read_tidig">;
> >>+
> >>  } // End TargetPrefix = "r600"
> >>+
> >>+let TargetPrefix = "AMDGPU" in {
> >>+def int_AMDGPU_div_scale :
> >>+  Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
> >>+            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
> >>+            GCCBuiltin<"__builtin_r600_div_scale">;
> >I think we should replace the r600 in the builtin name with amdgpu; this will
> >prevent some confusion about what hardware it is supported on.
> >
> >
> >>+
> >>+def int_AMDGPU_div_fmas :
> >>+  Intrinsic<[llvm_anyfloat_ty],
> >>+            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
> >>+            [IntrNoMem]>,
> >>+            GCCBuiltin<"__builtin_r600_div_fmas">;
> >>+
> >>+def int_AMDGPU_div_fixup :
> >>+  Intrinsic<[llvm_anyfloat_ty],
> >>+            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
> >>+            GCCBuiltin<"__builtin_r600_div_fixup">;
> >>+
> >>+def int_AMDGPU_trig_preop :
> >>+  Intrinsic<[llvm_anyfloat_ty],
> >>+            [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>,
> >>+            GCCBuiltin<"__builtin_r600_trig_preop">;
> >>+
> >>+def int_AMDGPU_rcp :
> >>+  Intrinsic<[llvm_anyfloat_ty],
> >>+            [LLVMMatchType<0>], [IntrNoMem]>,
> >>+            GCCBuiltin<"__builtin_r600_rcp">;
> >>+
> >>+} // End TargetPrefix = "AMDGPU"
> >>Index: lib/Target/R600/AMDGPUISelLowering.cpp
> >>===================================================================
> >>--- lib/Target/R600/AMDGPUISelLowering.cpp
> >>+++ lib/Target/R600/AMDGPUISelLowering.cpp
> >>@@ -896,6 +896,25 @@
> >>      case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
> >>        return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
> >>+
> >>+    case Intrinsic::AMDGPU_div_scale:
> >>+      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
> >>+                         Op.getOperand(1), Op.getOperand(2));
> >>+
> >>+    case Intrinsic::AMDGPU_div_fmas:
> >>+      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
> >>+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
> >>+
> >>+    case Intrinsic::AMDGPU_div_fixup:
> >>+      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
> >>+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
> >>+
> >>+    case Intrinsic::AMDGPU_trig_preop:
> >>+      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
> >>+                         Op.getOperand(1), Op.getOperand(2));
> >>+
> >>+    case Intrinsic::AMDGPU_rcp:
> >>+      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
> >>    }
> >>  }
> >>@@ -1912,6 +1931,13 @@
> >>    NODE_NAME_CASE(FMIN)
> >>    NODE_NAME_CASE(SMIN)
> >>    NODE_NAME_CASE(UMIN)
> >>+  NODE_NAME_CASE(URECIP)
> >>+  NODE_NAME_CASE(DIV_SCALE)
> >>+  NODE_NAME_CASE(DIV_FMAS)
> >>+  NODE_NAME_CASE(DIV_FIXUP)
> >>+  NODE_NAME_CASE(TRIG_PREOP)
> >>+  NODE_NAME_CASE(RCP)
> >>+  NODE_NAME_CASE(DOT4)
> >>    NODE_NAME_CASE(BFE_U32)
> >>    NODE_NAME_CASE(BFE_I32)
> >>    NODE_NAME_CASE(BFI)
> >>@@ -1920,8 +1946,6 @@
> >>    NODE_NAME_CASE(MUL_I24)
> >>    NODE_NAME_CASE(MAD_U24)
> >>    NODE_NAME_CASE(MAD_I24)
> >>-  NODE_NAME_CASE(URECIP)
> >>-  NODE_NAME_CASE(DOT4)
> >>    NODE_NAME_CASE(EXPORT)
> >>    NODE_NAME_CASE(CONST_ADDRESS)
> >>    NODE_NAME_CASE(REGISTER_LOAD)
> >>Index: lib/Target/R600/AMDGPUISelLowering.h
> >>===================================================================
> >>--- lib/Target/R600/AMDGPUISelLowering.h
> >>+++ lib/Target/R600/AMDGPUISelLowering.h
> >>@@ -179,6 +179,11 @@
> >>    SMIN,
> >>    UMIN,
> >>    URECIP,
> >>+  DIV_SCALE,
> >>+  DIV_FMAS,
> >>+  DIV_FIXUP,
> >>+  TRIG_PREOP,
> >>+  RCP,
> >>    DOT4,
> >>    BFE_U32, // Extract range of bits with zero extension to 32-bits.
> >>    BFE_I32, // Extract range of bits with sign extension to 32-bits.
> >>Index: lib/Target/R600/AMDGPUInstrInfo.td
> >>===================================================================
> >>--- lib/Target/R600/AMDGPUInstrInfo.td
> >>+++ lib/Target/R600/AMDGPUInstrInfo.td
> >>@@ -19,6 +19,14 @@
> >>    SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
> >>  ]>;
> >>+def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
> >>+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
> >>+>;
> >>+
> >>+def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
> >>+  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
> >>+>;
> >>+
> >>  //===----------------------------------------------------------------------===//
> >>  // AMDGPU DAG Nodes
> >>  //
> >>@@ -29,6 +37,9 @@
> >>  // out = a - floor(a)
> >>  def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
> >>+// out = 1.0 / a
> >>+def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
> >>+
> >>  // out = max(a, b) a and b are floats
> >>  def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
> >>    [SDNPCommutative, SDNPAssociative]
> >>@@ -78,6 +89,21 @@
> >>  // e is rounding error
> >>  def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
> >>+// Special case divide preop and flags.
> >>+def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
> >>+
> >>+//  Special case divide FMA with scale and flags (src0 = Quotient,
> >>+//  src1 = Denominator, src2 = Numerator).
> >>+def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
> >>+
> >>+// Single or double precision division fixup.
> >>+// Special case divide fixup and flags(src0 = Quotient, src1 =
> >>+// Denominator, src2 = Numerator).
> >>+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
> >>+
> >>+// Look Up 2.0 / pi src0 with segment select src1[4:0]
> >>+def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
> >>+
> >>  def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
> >>                            SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
> >>                            [SDNPHasChain, SDNPMayLoad]>;
> >>Index: lib/Target/R600/AMDGPUInstructions.td
> >>===================================================================
> >>--- lib/Target/R600/AMDGPUInstructions.td
> >>+++ lib/Target/R600/AMDGPUInstructions.td
> >>@@ -519,6 +519,16 @@
> >>    >;
> >>  }
> >>+class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
> >>+  (fdiv FP_ONE, vt:$src),
> >>+  (RcpInst $src)
> >>+>;
> >>+
> >>+class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
> >>+  (AMDGPUrcp (fsqrt vt:$src)),
> >>+  (RsqInst $src)
> >>+>;
> >>+
> >Do RCP and RSQ have IEEE precision?
> I'm unclear on this. Different sources seem to conflict. One place
> I've found claims it has 1 ulp precision; others say it is
> "approximate" with no mention of IEEE. If it is IEEE-precise, there's
> not really any reason to add the intrinsic for it.
> 

I think we should add comments defining the precision of the intrinsic
and the SDNode, and if we define it as approximate we should call it
something else, like rcp_approx or rcp_native.

-Tom