[PATCH] R600: Add intrinsics for math helper instructions

Wed Jun 18 16:06:12 PDT 2014

On Wed, Jun 18, 2014 at 08:04:38PM +0000, Matt Arsenault wrote:
> Rename builtin prefix, add rsq
> 
> http://reviews.llvm.org/D4168
> 
> Files:
>   include/llvm/IR/IntrinsicsR600.td
>   lib/Target/R600/AMDGPUISelLowering.cpp
>   lib/Target/R600/AMDGPUISelLowering.h
>   lib/Target/R600/AMDGPUInstrInfo.td
>   lib/Target/R600/AMDGPUInstructions.td
>   lib/Target/R600/AMDGPUIntrinsics.td
>   lib/Target/R600/SIInsertWaits.cpp
>   lib/Target/R600/SIInstructions.td
>   lib/Transforms/InstCombine/InstCombineCalls.cpp
>   test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
>   test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
>   test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
>   test/CodeGen/R600/llvm.AMDGPU.rcp.ll
>   test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
>   test/Transforms/InstCombine/r600-intrinsics.ll

> Index: include/llvm/IR/IntrinsicsR600.td
> ===================================================================
> --- include/llvm/IR/IntrinsicsR600.td
> +++ include/llvm/IR/IntrinsicsR600.td
> @@ -33,4 +33,40 @@
>                                         "__builtin_r600_read_tgid">;
>  defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
>                                         "__builtin_r600_read_tidig">;
> +
>  } // End TargetPrefix = "r600"
> +
> +let TargetPrefix = "AMDGPU" in {
> +def int_AMDGPU_div_scale :
> +  Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
> +            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_div_scale">;
> +
> +def int_AMDGPU_div_fmas :
> +  Intrinsic<[llvm_anyfloat_ty],
> +            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
> +            [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_div_fmas">;
> +
> +def int_AMDGPU_div_fixup :
> +  Intrinsic<[llvm_anyfloat_ty],
> +            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_div_fixup">;
> +
> +def int_AMDGPU_trig_preop :
> +  Intrinsic<[llvm_anyfloat_ty],
> +            [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_trig_preop">;
> +
> +def int_AMDGPU_rcp :
> +  Intrinsic<[llvm_anyfloat_ty],
> +            [LLVMMatchType<0>], [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_rcp">;
> +
> +def int_AMDGPU_rsq :
> +  Intrinsic<[llvm_anyfloat_ty],
> +            [LLVMMatchType<0>], [IntrNoMem]>,
> +            GCCBuiltin<"__builtin_amdgpu_rsq">;
> +
> +
> +} // End TargetPrefix = "AMDGPU"
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -909,6 +909,28 @@
>  
>      case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
>        return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
> +
> +    case Intrinsic::AMDGPU_div_scale:
> +      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
> +                         Op.getOperand(1), Op.getOperand(2));
> +
> +    case Intrinsic::AMDGPU_div_fmas:
> +      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
> +                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
> +
> +    case Intrinsic::AMDGPU_div_fixup:
> +      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
> +                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
> +
> +    case Intrinsic::AMDGPU_trig_preop:
> +      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
> +                         Op.getOperand(1), Op.getOperand(2));
> +
> +    case Intrinsic::AMDGPU_rcp:
> +      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
> +
> +    case Intrinsic::AMDGPU_rsq:
> +      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
>    }
>  }
>  
> @@ -2029,6 +2051,14 @@
>    NODE_NAME_CASE(FMIN)
>    NODE_NAME_CASE(SMIN)
>    NODE_NAME_CASE(UMIN)
> +  NODE_NAME_CASE(URECIP)
> +  NODE_NAME_CASE(DIV_SCALE)
> +  NODE_NAME_CASE(DIV_FMAS)
> +  NODE_NAME_CASE(DIV_FIXUP)
> +  NODE_NAME_CASE(TRIG_PREOP)
> +  NODE_NAME_CASE(RCP)
> +  NODE_NAME_CASE(RSQ)
> +  NODE_NAME_CASE(DOT4)
>    NODE_NAME_CASE(BFE_U32)
>    NODE_NAME_CASE(BFE_I32)
>    NODE_NAME_CASE(BFI)
> @@ -2038,8 +2068,6 @@
>    NODE_NAME_CASE(MUL_I24)
>    NODE_NAME_CASE(MAD_U24)
>    NODE_NAME_CASE(MAD_I24)
> -  NODE_NAME_CASE(URECIP)
> -  NODE_NAME_CASE(DOT4)
>    NODE_NAME_CASE(EXPORT)
>    NODE_NAME_CASE(CONST_ADDRESS)
>    NODE_NAME_CASE(REGISTER_LOAD)
> Index: lib/Target/R600/AMDGPUISelLowering.h
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.h
> +++ lib/Target/R600/AMDGPUISelLowering.h
> @@ -174,6 +174,9 @@
>    DWORDADDR,
>    FRACT,
>    CLAMP,
> +
> +  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
> +  // Denormals handled on some parts.
>    COS_HW,
>    SIN_HW,
>    FMAX,
> @@ -183,6 +186,15 @@
>    SMIN,
>    UMIN,
>    URECIP,
> +  DIV_SCALE,
> +  DIV_FMAS,
> +  DIV_FIXUP,
> +  TRIG_PREOP, // 1 ULP max error for f64
> +
> +  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
> +  //            For f64, max error 2^29 ULP, handles denormals.
> +  RCP,
> +  RSQ,
>    DOT4,
>    BFE_U32, // Extract range of bits with zero extension to 32-bits.
>    BFE_I32, // Extract range of bits with sign extension to 32-bits.
> Index: lib/Target/R600/AMDGPUInstrInfo.td
> ===================================================================
> --- lib/Target/R600/AMDGPUInstrInfo.td
> +++ lib/Target/R600/AMDGPUInstrInfo.td
> @@ -19,6 +19,14 @@
>    SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
>  ]>;
>  
> +def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
> +  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
> +>;
> +
> +def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
> +  [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
> +>;
> +
>  //===----------------------------------------------------------------------===//
>  // AMDGPU DAG Nodes
>  //
> @@ -29,6 +37,9 @@
>  // out = a - floor(a)
>  def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
>  
> +// out = 1.0 / a
> +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
> +
>  // out = max(a, b) a and b are floats
>  def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
>    [SDNPCommutative, SDNPAssociative]
> @@ -78,6 +89,21 @@
>  // e is rounding error
>  def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
>  
> +// Special case divide preop and flags.
> +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
> +
> +//  Special case divide FMA with scale and flags (src0 = Quotient,
> +//  src1 = Denominator, src2 = Numerator).
> +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
> +
> +// Single or double precision division fixup.
> +// Special case divide fixup and flags(src0 = Quotient, src1 =
> +// Denominator, src2 = Numerator).
> +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
> +
> +// Look Up 2.0 / pi src0 with segment select src1[4:0]
> +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
> +
>  def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
>                            SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
>                            [SDNPHasChain, SDNPMayLoad]>;
> Index: lib/Target/R600/AMDGPUInstructions.td
> ===================================================================
> --- lib/Target/R600/AMDGPUInstructions.td
> +++ lib/Target/R600/AMDGPUInstructions.td
> @@ -519,6 +519,16 @@
>    >;
>  }
>  
> +class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
> +  (fdiv FP_ONE, vt:$src),
> +  (RcpInst $src)
> +>;
> +
> +class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
> +  (AMDGPUrcp (fsqrt vt:$src)),
> +  (RsqInst $src)
> +>;
> +
>  include "R600Instructions.td"
>  include "R700Instructions.td"
>  include "EvergreenInstructions.td"
> Index: lib/Target/R600/AMDGPUIntrinsics.td
> ===================================================================
> --- lib/Target/R600/AMDGPUIntrinsics.td
> +++ lib/Target/R600/AMDGPUIntrinsics.td
> @@ -30,7 +30,6 @@
>    def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
>    def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
>    def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
> -  def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
>    def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
>    def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
>    def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
> Index: lib/Target/R600/SIInsertWaits.cpp
> ===================================================================
> --- lib/Target/R600/SIInsertWaits.cpp
> +++ lib/Target/R600/SIInsertWaits.cpp
> @@ -341,6 +341,8 @@
>    return Result;
>  }
>  
> +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
> +// around other non-memory instructions.
>  bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
>    bool Changes = false;
>  
> Index: lib/Target/R600/SIInstructions.td
> ===================================================================
> --- lib/Target/R600/SIInstructions.td
> +++ lib/Target/R600/SIInstructions.td
> @@ -1116,10 +1116,11 @@
>  defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
>    [(set f32:$dst, (flog2 f32:$src0))]
>  >;
> +
>  defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
>  defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
>  defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
> -  [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
> +  [(set f32:$dst, (AMDGPUrcp f32:$src0))]
>  >;
>  defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
>  defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
> @@ -1131,7 +1132,7 @@
>    [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))]
>  >;
>  defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
> -  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
> +  [(set f64:$dst, (AMDGPUrcp f64:$src0))]
>  >;
>  defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
>  defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
> @@ -1417,8 +1418,12 @@
>  //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
>  defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
>  ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
> -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
> -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
> +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
> +  [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
> +>;
> +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
> +  [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
> +>;
>  
>  def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
>    [(set i64:$dst, (shl i64:$src0, i32:$src1))]
> @@ -1452,12 +1457,19 @@
>  
>  defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
>  def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
> -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
> -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
> +
> +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
> +  [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
> +>;
> +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
> +  [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
> +>;
>  //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
>  //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
>  //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
> -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
> +def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
> +  [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
> +>;
>  
>  //===----------------------------------------------------------------------===//
>  // Pseudo Instructions
> @@ -2722,6 +2734,11 @@
>      (S_MOV_B32 0), sub1)
>  >;
>  
> +def : RcpPat<V_RCP_F32_e32, f32>;
> +def : RcpPat<V_RCP_F64_e32, f64>;
> +def : RsqPat<V_RSQ_F32_e32, f32>;
> +def : RsqPat<V_RSQ_F64_e32, f64>;
> +

I've been trying to group the patterns by instruction type.  Could you
add a VOP1 Patterns section before the VOP2 Patterns and move these
there?  With that change, this patch LGTM.

-Tom

>  //============================================================================//
>  // Miscellaneous Optimization Patterns
>  //============================================================================//
> Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
> ===================================================================
> --- lib/Transforms/InstCombine/InstCombineCalls.cpp
> +++ lib/Transforms/InstCombine/InstCombineCalls.cpp
> @@ -922,6 +922,20 @@
>      break;
>    }
>  
> +  case Intrinsic::AMDGPU_rcp: {
> +    if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
> +      const APFloat &ArgVal = C->getValueAPF();
> +      APFloat Val(ArgVal.getSemantics(), 1.0);
> +      APFloat::opStatus Status = Val.divide(ArgVal,
> +                                            APFloat::rmNearestTiesToEven);
> +      // Only do this if it was exact and therefore not dependent on the
> +      // rounding mode.
> +      if (Status == APFloat::opOK)
> +        return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
> +    }
> +
> +    break;
> +  }
>    case Intrinsic::stackrestore: {
>      // If the save is right next to the restore, remove the restore.  This can
>      // happen when variable allocas are DCE'd.
> Index: test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
> @@ -0,0 +1,27 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
> +declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
> +
> +; SI-LABEL: @test_div_fixup_f32:
> +; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
> +; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
> +; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
> +; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
> +; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
> +; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
> +  %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
> +  store float %result, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_div_fixup_f64:
> +; SI: V_DIV_FIXUP_F64
> +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
> +  %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
> +  store double %result, double addrspace(1)* %out, align 8
> +  ret void
> +}
> Index: test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
> @@ -0,0 +1,27 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
> +declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
> +
> +; SI-LABEL: @test_div_fmas_f32:
> +; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
> +; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
> +; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
> +; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
> +; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
> +; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
> +; SI: BUFFER_STORE_DWORD [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
> +  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
> +  store float %result, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_div_fmas_f64:
> +; SI: V_DIV_FMAS_F64
> +define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
> +  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
> +  store double %result, double addrspace(1)* %out, align 8
> +  ret void
> +}
> Index: test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
> @@ -0,0 +1,23 @@
> +; XFAIL: *
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare float @llvm.AMDGPU.div.scale.f32(float, float) nounwind readnone
> +declare double @llvm.AMDGPU.div.scale.f64(double, double) nounwind readnone
> +
> +; SI-LABEL: @test_div_scale_f32:
> +define void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
> +  %a = load float addrspace(1)* %aptr, align 4
> +  %b = load float addrspace(1)* %bptr, align 4
> +  %result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b) nounwind readnone
> +  store float %result, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; SI-LABEL: @test_div_scale_f64:
> +define void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr) nounwind {
> +  %a = load double addrspace(1)* %aptr, align 8
> +  %b = load double addrspace(1)* %bptr, align 8
> +  %result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b) nounwind readnone
> +  store double %result, double addrspace(1)* %out, align 8
> +  ret void
> +}
> Index: test/CodeGen/R600/llvm.AMDGPU.rcp.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.AMDGPU.rcp.ll
> @@ -0,0 +1,58 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
> +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
> +
> +
> +declare float @llvm.sqrt.f32(float) nounwind readnone
> +declare double @llvm.sqrt.f64(double) nounwind readnone
> +
> +; FUNC-LABEL: @rcp_f32
> +; SI: V_RCP_F32_e32
> +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
> +  %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
> +  store float %rcp, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @rcp_f64
> +; SI: V_RCP_F64_e32
> +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
> +  %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
> +  store double %rcp, double addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @rcp_pat_f32
> +; SI: V_RCP_F32_e32
> +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
> +  %rcp = fdiv float 1.0, %src
> +  store float %rcp, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @rcp_pat_f64
> +; SI: V_RCP_F64_e32
> +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
> +  %rcp = fdiv double 1.0, %src
> +  store double %rcp, double addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; FUNC-LABEL: @rsq_rcp_pat_f32
> +; SI: V_RSQ_F32_e32
> +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
> +  %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
> +  %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
> +  store float %rcp, float addrspace(1)* %out, align 4
> +  ret void
> +}
> +
> +; FUNC-LABEL: @rsq_rcp_pat_f64
> +; SI: V_RSQ_F64_e32
> +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
> +  %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
> +  %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
> +  store double %rcp, double addrspace(1)* %out, align 8
> +  ret void
> +}
> Index: test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
> @@ -0,0 +1,29 @@
> +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
> +
> +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
> +
> +; SI-LABEL: @test_trig_preop_f64:
> +; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]]
> +; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
> +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
> +; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
> +  %a = load double addrspace(1)* %aptr, align 8
> +  %b = load i32 addrspace(1)* %bptr, align 4
> +  %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
> +  store double %result, double addrspace(1)* %out, align 8
> +  ret void
> +}
> +
> +; SI-LABEL: @test_trig_preop_f64_imm_segment:
> +; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
> +; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
> +; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
> +; SI: S_ENDPGM
> +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
> +  %a = load double addrspace(1)* %aptr, align 8
> +  %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
> +  store double %result, double addrspace(1)* %out, align 8
> +  ret void
> +}
> Index: test/Transforms/InstCombine/r600-intrinsics.ll
> ===================================================================
> --- /dev/null
> +++ test/Transforms/InstCombine/r600-intrinsics.ll
> @@ -0,0 +1,47 @@
> +; RUN: opt -instcombine -S < %s | FileCheck %s
> +
> +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
> +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f32_1
> +; CHECK-NEXT: ret float 1.000000e+00
> +define float @test_constant_fold_rcp_f32_1() nounwind {
> +  %val = call float @llvm.AMDGPU.rcp.f32(float 1.0) nounwind readnone
> +  ret float %val
> +}
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f64_1
> +; CHECK-NEXT:  ret double 1.000000e+00
> +define double @test_constant_fold_rcp_f64_1() nounwind {
> +  %val = call double @llvm.AMDGPU.rcp.f64(double 1.0) nounwind readnone
> +  ret double %val
> +}
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f32_half
> +; CHECK-NEXT: ret float 2.000000e+00
> +define float @test_constant_fold_rcp_f32_half() nounwind {
> +  %val = call float @llvm.AMDGPU.rcp.f32(float 0.5) nounwind readnone
> +  ret float %val
> +}
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f64_half
> +; CHECK-NEXT:  ret double 2.000000e+00
> +define double @test_constant_fold_rcp_f64_half() nounwind {
> +  %val = call double @llvm.AMDGPU.rcp.f64(double 0.5) nounwind readnone
> +  ret double %val
> +}
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f32_43
> +; CHECK-NEXT: call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01)
> +define float @test_constant_fold_rcp_f32_43() nounwind {
> + %val = call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) nounwind readnone
> + ret float %val
> +}
> +
> +; CHECK-LABEL: @test_constant_fold_rcp_f64_43
> +; CHECK-NEXT: call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01)
> +define double @test_constant_fold_rcp_f64_43() nounwind {
> +  %val = call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) nounwind readnone
> +  ret double %val
> +}
> +

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits