[PATCH] R600: Add intrinsics for math helper instructions

Tue Jun 17 10:05:19 PDT 2014

On 06/17/2014 07:10 AM, Tom Stellard wrote:
> On Tue, Jun 17, 2014 at 12:28:14AM +0000, Matt Arsenault wrote:
>> These will be used in custom lowering and in library implementations of various math functions, so it's useful to expose them as builtins.
>>
>> http://reviews.llvm.org/D4168
>>
>> Files:
>>    include/llvm/IR/IntrinsicsR600.td
>>    lib/Target/R600/AMDGPUISelLowering.cpp
>>    lib/Target/R600/AMDGPUISelLowering.h
>>    lib/Target/R600/AMDGPUInstrInfo.td
>>    lib/Target/R600/AMDGPUInstructions.td
>>    lib/Target/R600/AMDGPUIntrinsics.td
>>    lib/Target/R600/SIInsertWaits.cpp
>>    lib/Target/R600/SIInstructions.td
>>    lib/Transforms/InstCombine/InstCombineCalls.cpp
>>    test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
>>    test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
>>    test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
>>    test/CodeGen/R600/llvm.AMDGPU.rcp.ll
>>    test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
>>    test/Transforms/InstCombine/r600-intrinsics.ll
>> Index: include/llvm/IR/IntrinsicsR600.td
>> ===================================================================
>> --- include/llvm/IR/IntrinsicsR600.td
>> +++ include/llvm/IR/IntrinsicsR600.td
>> @@ -33,4 +33,34 @@
>>                                          "__builtin_r600_read_tgid">;
>>   defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
>>                                          "__builtin_r600_read_tidig">;
>> +
>>   } // End TargetPrefix = "r600"
>> +
>> +let TargetPrefix = "AMDGPU" in {
>> +def int_AMDGPU_div_scale :
>> +  Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
>> +            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
>> +            GCCBuiltin<"__builtin_r600_div_scale">;
> I think we should replace the r600 in the builtin name with amdgpu; this will
> prevent some confusion about what hardware it is supported on.
>
>
>> +
>> +def int_AMDGPU_div_fmas :
>> +  Intrinsic<[llvm_anyfloat_ty],
>> +            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
>> +            [IntrNoMem]>,
>> +            GCCBuiltin<"__builtin_r600_div_fmas">;
>> +
>> +def int_AMDGPU_div_fixup :
>> +  Intrinsic<[llvm_anyfloat_ty],
>> +            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
>> +            GCCBuiltin<"__builtin_r600_div_fixup">;
>> +
>> +def int_AMDGPU_trig_preop :
>> +  Intrinsic<[llvm_anyfloat_ty],
>> +            [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>,
>> +            GCCBuiltin<"__builtin_r600_trig_preop">;
>> +
>> +def int_AMDGPU_rcp :
>> +  Intrinsic<[llvm_anyfloat_ty],
>> +            [LLVMMatchType<0>], [IntrNoMem]>,
>> +            GCCBuiltin<"__builtin_r600_rcp">;
>> +
>> +} // End TargetPrefix = "AMDGPU"
>> Index: lib/Target/R600/AMDGPUISelLowering.cpp
>> ===================================================================
>> --- lib/Target/R600/AMDGPUISelLowering.cpp
>> +++ lib/Target/R600/AMDGPUISelLowering.cpp
>> @@ -896,6 +896,25 @@
>>   
>>       case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
>>         return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
>> +
>> +    case Intrinsic::AMDGPU_div_scale:
>> +      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
>> +                         Op.getOperand(1), Op.getOperand(2));
>> +
>> +    case Intrinsic::AMDGPU_div_fmas:
>> +      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
>> +                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
>> +
>> +    case Intrinsic::AMDGPU_div_fixup:
>> +      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
>> +                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
>> +
>> +    case Intrinsic::AMDGPU_trig_preop:
>> +      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
>> +                         Op.getOperand(1), Op.getOperand(2));
>> +
>> +    case Intrinsic::AMDGPU_rcp:
>> +      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
>>     }
>>   }
>>   
>> @@ -1912,6 +1931,13 @@
>>     NODE_NAME_CASE(FMIN)
>>     NODE_NAME_CASE(SMIN)
>>     NODE_NAME_CASE(UMIN)
>> +  NODE_NAME_CASE(URECIP)
>> +  NODE_NAME_CASE(DIV_SCALE)
>> +  NODE_NAME_CASE(DIV_FMAS)
>> +  NODE_NAME_CASE(DIV_FIXUP)
>> +  NODE_NAME_CASE(TRIG_PREOP)
>> +  NODE_NAME_CASE(RCP)
>> +  NODE_NAME_CASE(DOT4)
>>     NODE_NAME_CASE(BFE_U32)
>>     NODE_NAME_CASE(BFE_I32)
>>     NODE_NAME_CASE(BFI)
>> @@ -1920,8 +1946,6 @@
>>     NODE_NAME_CASE(MUL_I24)
>>     NODE_NAME_CASE(MAD_U24)
>>     NODE_NAME_CASE(MAD_I24)
>> -  NODE_NAME_CASE(URECIP)
>> -  NODE_NAME_CASE(DOT4)
>>     NODE_NAME_CASE(EXPORT)
>>     NODE_NAME_CASE(CONST_ADDRESS)
>>     NODE_NAME_CASE(REGISTER_LOAD)
>> Index: lib/Target/R600/AMDGPUISelLowering.h
>> ===================================================================
>> --- lib/Target/R600/AMDGPUISelLowering.h
>> +++ lib/Target/R600/AMDGPUISelLowering.h
>> @@ -179,6 +179,11 @@
>>     SMIN,
>>     UMIN,
>>     URECIP,
>> +  DIV_SCALE,
>> +  DIV_FMAS,
>> +  DIV_FIXUP,
>> +  TRIG_PREOP,
>> +  RCP,
>>     DOT4,
>>     BFE_U32, // Extract range of bits with zero extension to 32-bits.
>>     BFE_I32, // Extract range of bits with sign extension to 32-bits.
>> Index: lib/Target/R600/AMDGPUInstrInfo.td
>> ===================================================================
>> --- lib/Target/R600/AMDGPUInstrInfo.td
>> +++ lib/Target/R600/AMDGPUInstrInfo.td
>> @@ -19,6 +19,14 @@
>>     SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
>>   ]>;
>>   
>> +def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
>> +  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
>> +>;
>> +
>> +def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
>> +  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>> +>;
>> +
>>   //===----------------------------------------------------------------------===//
>>   // AMDGPU DAG Nodes
>>   //
>> @@ -29,6 +37,9 @@
>>   // out = a - floor(a)
>>   def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
>>   
>> +// out = 1.0 / a
>> +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
>> +
>>   // out = max(a, b) a and b are floats
>>   def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
>>     [SDNPCommutative, SDNPAssociative]
>> @@ -78,6 +89,21 @@
>>   // e is rounding error
>>   def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
>>   
>> +// Special case divide preop and flags.
>> +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
>> +
>> +//  Special case divide FMA with scale and flags (src0 = Quotient,
>> +//  src1 = Denominator, src2 = Numerator).
>> +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
>> +
>> +// Single or double precision division fixup.
>> +// Special case divide fixup and flags(src0 = Quotient, src1 =
>> +// Denominator, src2 = Numerator).
>> +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
>> +
>> +// Look Up 2.0 / pi src0 with segment select src1[4:0]
>> +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
>> +
>>   def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
>>                             SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
>>                             [SDNPHasChain, SDNPMayLoad]>;
>> Index: lib/Target/R600/AMDGPUInstructions.td
>> ===================================================================
>> --- lib/Target/R600/AMDGPUInstructions.td
>> +++ lib/Target/R600/AMDGPUInstructions.td
>> @@ -519,6 +519,16 @@
>>     >;
>>   }
>>   
>> +class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
>> +  (fdiv FP_ONE, vt:$src),
>> +  (RcpInst $src)
>> +>;
>> +
>> +class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
>> +  (AMDGPUrcp (fsqrt vt:$src)),
>> +  (RsqInst $src)
>> +>;
>> +
> Do RCP and RSQ have IEEE precision?
I'm unclear on this. Different sources seem to conflict. One place I've
found claims it has 1 ulp precision; others say it is "approximate" with
no mention of IEEE. If it is, there's not really any reason to add the
intrinsic for it