[PATCH] R600/SI: Custom lower fround

Tom Stellard tom at stellard.net
Wed Jan 21 07:03:18 PST 2015


On Wed, Jan 21, 2015 at 03:21:25AM +0000, Matt Arsenault wrote:
> This fixes it for SI. It also removes the pattern
> previously used for f32 on Evergreen. I'm not sure
> if the new R600 output is better or not, but it uses
> one fewer instruction if BFI is available.
> 

LGTM.
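
For reference, the f32 path (LowerFROUND32 in the patch below) computes
round(x) as trunc(x) plus a half-away-from-zero correction. A minimal C
sketch of the same computation, my own illustration rather than code from
the patch:

  #include <math.h>

  /* round half away from zero, mirroring the f32 DAG lowering */
  static float round_f32_sketch(float x) {
    float t = truncf(x);                     /* FTRUNC                      */
    float diff = fabsf(x - t);               /* FABS(FSUB x, t)             */
    float step = copysignf(1.0f, x);         /* FCOPYSIGN(1.0, x)           */
    return t + (diff >= 0.5f ? step : 0.0f); /* SELECT on SETOGE, then FADD */
  }

The copysign is what shows up as v_bfi_b32 in the new SI checks in
llvm.round.ll.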

> http://reviews.llvm.org/D7079
> 
> Files:
>   lib/Target/R600/AMDGPUISelLowering.cpp
>   lib/Target/R600/AMDGPUISelLowering.h
>   lib/Target/R600/EvergreenInstructions.td
>   lib/Target/R600/R600Instructions.td
>   test/CodeGen/R600/llvm.round.f64.ll
>   test/CodeGen/R600/llvm.round.ll
> 

> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -127,9 +127,11 @@
>    setOperationAction(ISD::FABS,   MVT::f32, Legal);
>    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
>    setOperationAction(ISD::FRINT,  MVT::f32, Legal);
> -  setOperationAction(ISD::FROUND, MVT::f32, Legal);
>    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
>  
> +  setOperationAction(ISD::FROUND, MVT::f32, Custom);
> +  setOperationAction(ISD::FROUND, MVT::f64, Custom);
> +
>    setOperationAction(ISD::FREM, MVT::f32, Custom);
>    setOperationAction(ISD::FREM, MVT::f64, Custom);
>  
> @@ -610,6 +612,7 @@
>    case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
>    case ISD::FRINT: return LowerFRINT(Op, DAG);
>    case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
> +  case ISD::FROUND: return LowerFROUND(Op, DAG);
>    case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
>    case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
>    case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
> @@ -1917,6 +1920,20 @@
>    return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
>  }
>  
> +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
> +  const unsigned FractBits = 52;
> +  const unsigned ExpBits = 11;
> +
> +  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
> +                                Hi,
> +                                DAG.getConstant(FractBits - 32, MVT::i32),
> +                                DAG.getConstant(ExpBits, MVT::i32));
> +  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
> +                            DAG.getConstant(1023, MVT::i32));
> +
> +  return Exp;
> +}
> +
>  SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
>    SDLoc SL(Op);
>    SDValue Src = Op.getOperand(0);
> @@ -1932,16 +1949,9 @@
>    // exponent.
>    SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
>  
> -  const unsigned FractBits = 52;
> -  const unsigned ExpBits = 11;
> +  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
>  
> -  // Extract the exponent.
> -  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
> -                                Hi,
> -                                DAG.getConstant(FractBits - 32, MVT::i32),
> -                                DAG.getConstant(ExpBits, MVT::i32));
> -  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
> -                            DAG.getConstant(1023, MVT::i32));
> +  const unsigned FractBits = 52;
>  
>    // Extract the sign bit.
>    const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
> @@ -2004,6 +2014,99 @@
>    return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
>  }
>  
> +// XXX - May require not supporting f32 denormals?
> +SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
> +  SDLoc SL(Op);
> +  SDValue X = Op.getOperand(0);
> +
> +  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
> +
> +  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
> +
> +  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
> +
> +  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
> +  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
> +  const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
> +
> +  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
> +
> +  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
> +
> +  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
> +
> +  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
> +
> +  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
> +}
> +
> +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
> +  SDLoc SL(Op);
> +  SDValue X = Op.getOperand(0);
> +
> +  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
> +
> +  const SDValue Zero = DAG.getConstant(0, MVT::i32);
> +  const SDValue One = DAG.getConstant(1, MVT::i32);
> +  const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
> +  const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
> +  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
> +
> +
> +  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
> +
> +  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
> +
> +  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
> +
> +  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
> +
> +  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
> +  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
> +                          DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
> +                          Exp);
> +
> +  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
> +  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
> +                              DAG.getConstant(0, MVT::i64), Tmp0,
> +                              ISD::SETNE);
> +
> +  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
> +                             D, DAG.getConstant(0, MVT::i64));
> +  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
> +
> +  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
> +  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
> +
> +  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
> +  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
> +  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
> +
> +  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
> +                            ExpEqNegOne,
> +                            DAG.getConstantFP(1.0, MVT::f64),
> +                            DAG.getConstantFP(0.0, MVT::f64));
> +
> +  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
> +
> +  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
> +  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
> +
> +  return K;
> +}
> +
> +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
> +  EVT VT = Op.getValueType();
> +
> +  if (VT == MVT::f32)
> +    return LowerFROUND32(Op, DAG);
> +
> +  if (VT == MVT::f64)
> +    return LowerFROUND64(Op, DAG);
> +
> +  llvm_unreachable("unhandled type");
> +}
> +
>  SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
>    SDLoc SL(Op);
>    SDValue Src = Op.getOperand(0);
> Index: lib/Target/R600/AMDGPUISelLowering.h
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.h
> +++ lib/Target/R600/AMDGPUISelLowering.h
> @@ -49,6 +49,10 @@
>    SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
>    SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
>    SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
> +
> +  SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
> +  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
> +  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
>    SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
>  
>    SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
> Index: lib/Target/R600/EvergreenInstructions.td
> ===================================================================
> --- lib/Target/R600/EvergreenInstructions.td
> +++ lib/Target/R600/EvergreenInstructions.td
> @@ -590,8 +590,6 @@
>  // SHA-256 Patterns
>  def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
>  
> -def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
> -
>  def EG_ExportSwz : ExportSwzInst {
>    let Word1{19-16} = 0; // BURST_COUNT
>    let Word1{20} = 0; // VALID_PIXEL_MODE
> Index: lib/Target/R600/R600Instructions.td
> ===================================================================
> --- lib/Target/R600/R600Instructions.td
> +++ lib/Target/R600/R600Instructions.td
> @@ -1142,16 +1142,6 @@
>    (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
>  >;
>  
> -// FROUND pattern
> -class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
> -  (AMDGPUround f32:$x),
> -  (CNDGE $x,
> -  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
> -  (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
> -  )
> ->;
> -
> -
>  //===----------------------------------------------------------------------===//
>  // R600 / R700 Instructions
>  //===----------------------------------------------------------------------===//
> @@ -1195,8 +1185,6 @@
>    def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
>    defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
>  
> -  def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
> -
>    def R600_ExportSwz : ExportSwzInst {
>      let Word1{20-17} = 0; // BURST_COUNT
>      let Word1{21} = eop;
> Index: test/CodeGen/R600/llvm.round.f64.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/llvm.round.f64.ll
> @@ -0,0 +1,74 @@
> +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +
> +; FUNC-LABEL: {{^}}round_f64:
> +; SI: s_endpgm
> +define void @round_f64(double addrspace(1)* %out, double %x) #0 {
> +  %result = call double @llvm.round.f64(double %x) #1
> +  store double %result, double addrspace(1)* %out
> +  ret void
> +}
> +
> +; This is a pretty large function, so just test a few of the
> +; instructions that are necessary.
> +
> +; FUNC-LABEL: {{^}}v_round_f64:
> +; SI: buffer_load_dwordx2
> +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
> +
> +; SI: v_not_b32_e32
> +; SI: v_not_b32_e32
> +
> +; SI: v_cmp_eq_i32
> +
> +; SI: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
> +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
> +
> +; SI: v_cmp_lt_i32_e64
> +; SI: v_cmp_gt_i32_e64
> +
> +
> +; SI: buffer_store_dwordx2
> +; SI: s_endpgm
> +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
> +  %tid = call i32 @llvm.r600.read.tidig.x() #1
> +  %gep = getelementptr double addrspace(1)* %in, i32 %tid
> +  %out.gep = getelementptr double addrspace(1)* %out, i32 %tid
> +  %x = load double addrspace(1)* %gep
> +  %result = call double @llvm.round.f64(double %x) #1
> +  store double %result, double addrspace(1)* %out.gep
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}round_v2f64:
> +; SI: s_endpgm
> +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
> +  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
> +  store <2 x double> %result, <2 x double> addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}round_v4f64:
> +; SI: s_endpgm
> +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
> +  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
> +  store <4 x double> %result, <4 x double> addrspace(1)* %out
> +  ret void
> +}
> +
> +; FUNC-LABEL: {{^}}round_v8f64:
> +; SI: s_endpgm
> +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
> +  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
> +  store <8 x double> %result, <8 x double> addrspace(1)* %out
> +  ret void
> +}
> +
> +declare i32 @llvm.r600.read.tidig.x() #1
> +
> +declare double @llvm.round.f64(double) #1
> +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
> +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
> +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
> +
> +attributes #0 = { nounwind }
> +attributes #1 = { nounwind readnone }
> Index: test/CodeGen/R600/llvm.round.ll
> ===================================================================
> --- test/CodeGen/R600/llvm.round.ll
> +++ test/CodeGen/R600/llvm.round.ll
> @@ -1,43 +1,66 @@
> -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
> -
> -; FUNC-LABEL: {{^}}f32:
> -; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
> -; R600-DAG: ADD  {{.*}}, -0.5
> -; R600-DAG: CEIL {{.*}} [[ARG]]
> -; R600-DAG: FLOOR {{.*}} [[ARG]]
> -; R600-DAG: CNDGE
> -; R600-DAG: CNDGT
> -; R600: CNDGE {{[^,]+}}, [[ARG]]
> -define void @f32(float addrspace(1)* %out, float %in) {
> -entry:
> -  %0 = call float @llvm.round.f32(float %in)
> -  store float %0, float addrspace(1)* %out
> +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
> +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
> +
> +; FUNC-LABEL: {{^}}round_f32:
> +; SI-DAG: s_load_dword [[SX:s[0-9]+]]
> +; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
> +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
> +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
> +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
> +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
> +; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5
> +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
> +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
> +; SI: buffer_store_dword [[RESULT]]
> +
> +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
> +; R600-DAG: ADD  {{.*}},
> +; R600-DAG: BFI_INT
> +; R600-DAG: SETGE
> +; R600-DAG: CNDE
> +; R600-DAG: ADD
> +define void @round_f32(float addrspace(1)* %out, float %x) #0 {
> +  %result = call float @llvm.round.f32(float %x) #1
> +  store float %result, float addrspace(1)* %out
>    ret void
>  }
>  
>  ; The vector tests are really difficult to verify, since it can be hard to
>  ; predict how the scheduler will order the instructions.  We already have
>  ; a test for the scalar case, so the vector tests just check that the
>  ; compiler doesn't crash.
>  
> -; FUNC-LABEL: v2f32
> +; FUNC-LABEL: {{^}}round_v2f32:
> +; SI: s_endpgm
>  ; R600: CF_END
> -define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
> -entry:
> -  %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
> -  store <2 x float> %0, <2 x float> addrspace(1)* %out
> +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
> +  %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
> +  store <2 x float> %result, <2 x float> addrspace(1)* %out
>    ret void
>  }
>  
> -; FUNC-LABEL: v4f32
> +; FUNC-LABEL: {{^}}round_v4f32:
> +; SI: s_endpgm
>  ; R600: CF_END
> -define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
> -entry:
> -  %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
> -  store <4 x float> %0, <4 x float> addrspace(1)* %out
> +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
> +  %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
> +  store <4 x float> %result, <4 x float> addrspace(1)* %out
>    ret void
>  }
>  
> -declare float @llvm.round.f32(float)
> -declare <2 x float> @llvm.round.v2f32(<2 x float>)
> -declare <4 x float> @llvm.round.v4f32(<4 x float>)
> +; FUNC-LABEL: {{^}}round_v8f32:
> +; SI: s_endpgm
> +; R600: CF_END
> +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
> +  %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
> +  store <8 x float> %result, <8 x float> addrspace(1)* %out
> +  ret void
> +}
> +
> +declare float @llvm.round.f32(float) #1
> +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
> +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
> +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
> +
> +attributes #0 = { nounwind }
> +attributes #1 = { nounwind readnone }
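
One closing note for readers of LowerFROUND64 above: the f64 path does not
go through FTRUNC; it rounds by manipulating the mantissa bits directly.
Roughly, in C, as I read the code (a sketch of the idea, not the actual
implementation):

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  static double round_f64_sketch(double x) {
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    int exp = (int)((bits >> 52) & 0x7ff) - 1023;        /* extractF64Exponent */

    if (exp < 0)                          /* |x| < 1.0                          */
      return copysign(exp == -1 ? 1.0 : 0.0, x);         /* +-1 if |x| >= 0.5   */
    if (exp > 51)                         /* already integral, or Inf/NaN       */
      return x;

    uint64_t frac = UINT64_C(0x000fffffffffffff) >> exp; /* fractional bits     */
    uint64_t half = UINT64_C(0x0008000000000000) >> exp; /* the 0.5 place       */

    uint64_t k = bits;
    if (bits & frac)                      /* nonzero fraction: add 0.5 ...      */
      k += half;
    k &= ~frac;                           /* ... then clear the fraction        */

    memcpy(&x, &k, sizeof x);
    return x;
  }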
