[llvm] r226682 - R600/SI: Custom lower fround

Wed Jan 21 10:18:25 PST 2015

Author: arsenm
Date: Wed Jan 21 12:18:25 2015
New Revision: 226682

URL: http://llvm.org/viewvc/llvm-project?rev=226682&view=rev
Log:
R600/SI: Custom lower fround

This fixes it for SI. It also removes the pattern
used previously for Evergreen for f32. I'm not sure
if the the new R600 output is better or not, but it uses
1 fewer instructions if BFI is available.

Added:
    llvm/trunk/test/CodeGen/R600/llvm.round.f64.ll
Modified:
    llvm/trunk/lib/Target/R600/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h
    llvm/trunk/lib/Target/R600/EvergreenInstructions.td
    llvm/trunk/lib/Target/R600/R600Instructions.td
    llvm/trunk/test/CodeGen/R600/llvm.round.ll

Modified: llvm/trunk/lib/Target/R600/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelLowering.cpp?rev=226682&r1=226681&r2=226682&view=diff
==============================================================================

--- llvm/trunk/lib/Target/R600/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelLowering.cpp Wed Jan 21 12:18:25 2015
@@ -127,9 +127,11 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setOperationAction(ISD::FABS,   MVT::f32, Legal);
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
-  setOperationAction(ISD::FROUND, MVT::f32, Legal);
   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 
+  setOperationAction(ISD::FROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
   setOperationAction(ISD::FREM, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, Custom);
 
@@ -610,6 +612,7 @@ SDValue AMDGPUTargetLowering::LowerOpera
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);
   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@@ -1917,6 +1920,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL
   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
 }
 
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  return Exp;
+}
+
 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -1932,16 +1949,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUN
   // exponent.
   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
 
-  const unsigned FractBits = 52;
-  const unsigned ExpBits = 11;
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
 
-  // Extract the exponent.
-  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
-                                Hi,
-                                DAG.getConstant(FractBits - 32, MVT::i32),
-                                DAG.getConstant(ExpBits, MVT::i32));
-  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
-                            DAG.getConstant(1023, MVT::i32));
+  const unsigned FractBits = 52;
 
   // Extract the sign bit.
   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@@ -2004,6 +2014,99 @@ SDValue AMDGPUTargetLowering::LowerFNEAR
   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
 }
 
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+  const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
+
+  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+  const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
+  const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
+
+  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+                          DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
+                          Exp);
+
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+                              DAG.getConstant(0, MVT::i64), Tmp0,
+                              ISD::SETNE);
+
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+                             D, DAG.getConstant(0, MVT::i64));
+  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+                            ExpEqNegOne,
+                            DAG.getConstantFP(1.0, MVT::f64),
+                            DAG.getConstantFP(0.0, MVT::f64));
+
+  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+  return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFROUND32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFROUND64(Op, DAG);
+
+  llvm_unreachable("unhandled type");
+}
+
 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);

Modified: llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h?rev=226682&r1=226681&r2=226682&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelLowering.h Wed Jan 21 12:18:25 2015
@@ -49,6 +49,10 @@ private:
   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;

Modified: llvm/trunk/lib/Target/R600/EvergreenInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/EvergreenInstructions.td?rev=226682&r1=226681&r2=226682&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/EvergreenInstructions.td (original)
+++ llvm/trunk/lib/Target/R600/EvergreenInstructions.td Wed Jan 21 12:18:25 2015
@@ -590,8 +590,6 @@ def : Pat<(fp_to_uint f32:$src0), (FLT_T
 // SHA-256 Patterns
 def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
 
-def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
-
 def EG_ExportSwz : ExportSwzInst {
   let Word1{19-16} = 0; // BURST_COUNT
   let Word1{20} = 0; // VALID_PIXEL_MODE

Modified: llvm/trunk/lib/Target/R600/R600Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/R600Instructions.td?rev=226682&r1=226681&r2=226682&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/R600Instructions.td (original)
+++ llvm/trunk/lib/Target/R600/R600Instructions.td Wed Jan 21 12:18:25 2015
@@ -1142,16 +1142,6 @@ class TGSI_LIT_Z_Common <InstR600 mul_li
   (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
 >;
 
-// FROUND pattern
-class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
-  (AMDGPUround f32:$x),
-  (CNDGE $x,
-  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
-  (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
-  )
->;
-
-
 //===----------------------------------------------------------------------===//
 // R600 / R700 Instructions
 //===----------------------------------------------------------------------===//
@@ -1195,8 +1185,6 @@ let Predicates = [isR600] in {
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
   defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
 
-  def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
-
   def R600_ExportSwz : ExportSwzInst {
     let Word1{20-17} = 0; // BURST_COUNT
     let Word1{21} = eop;

Added: llvm/trunk/test/CodeGen/R600/llvm.round.f64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/llvm.round.f64.ll?rev=226682&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/R600/llvm.round.f64.ll (added)
+++ llvm/trunk/test/CodeGen/R600/llvm.round.f64.ll Wed Jan 21 12:18:25 2015
@@ -0,0 +1,74 @@
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}round_f64:
+; SI: s_endpgm
+define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; This is a pretty large function, so just test a few of the
+; instructions that are necessary.
+
+; FUNC-LABEL: {{^}}v_round_f64:
+; SI: buffer_load_dwordx2
+; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11
+
+; SI: v_not_b32_e32
+; SI: v_not_b32_e32
+
+; SI: v_cmp_eq_i32
+
+; SI: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
+
+; SI: v_cmp_lt_i32_e64
+; SI: v_cmp_gt_i32_e64
+
+
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %out.gep = getelementptr double addrspace(1)* %out, i32 %tid
+  %x = load double addrspace(1)* %gep
+  %result = call double @llvm.round.f64(double %x) #1
+  store double %result, double addrspace(1)* %out.gep
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v2f64:
+; SI: s_endpgm
+define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
+  store <2 x double> %result, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f64:
+; SI: s_endpgm
+define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
+  store <4 x double> %result, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v8f64:
+; SI: s_endpgm
+define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
+  store <8 x double> %result, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+declare double @llvm.round.f64(double) #1
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
+declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
+declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Modified: llvm/trunk/test/CodeGen/R600/llvm.round.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/llvm.round.ll?rev=226682&r1=226681&r2=226682&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/llvm.round.ll (original)
+++ llvm/trunk/test/CodeGen/R600/llvm.round.ll Wed Jan 21 12:18:25 2015
@@ -1,17 +1,27 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: {{^}}f32:
-; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
-; R600-DAG: ADD  {{.*}}, -0.5
-; R600-DAG: CEIL {{.*}} [[ARG]]
-; R600-DAG: FLOOR {{.*}} [[ARG]]
-; R600-DAG: CNDGE
-; R600-DAG: CNDGT
-; R600: CNDGE {{[^,]+}}, [[ARG]]
-define void @f32(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = call float @llvm.round.f32(float %in)
-  store float %0, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}round_f32:
+; SI-DAG: s_load_dword [[SX:s[0-9]+]]
+; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5
+; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; SI: buffer_store_dword [[RESULT]]
+
+; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD  {{.*}},
+; R600-DAG: BFI_INT
+; R600-DAG: SETGE
+; R600-DAG: CNDE
+; R600-DAG: ADD
+define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+  %result = call float @llvm.round.f32(float %x) #1
+  store float %result, float addrspace(1)* %out
   ret void
 }
 
@@ -20,24 +30,37 @@ entry:
 ; a test for the scalar case, so the vector tests just check that the
 ; compiler doesn't crash.
 
-; FUNC-LABEL: v2f32
+; FUNC-LABEL: {{^}}round_v2f32:
+; SI: s_endpgm
+; R600: CF_END
+define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+  %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}round_v4f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+  %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: v4f32
+; FUNC-LABEL: {{^}}round_v8f32:
+; SI: s_endpgm
 ; R600: CF_END
-define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+  %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
+  store <8 x float> %result, <8 x float> addrspace(1)* %out
   ret void
 }
 
-declare float @llvm.round.f32(float)
-declare <2 x float> @llvm.round.v2f32(<2 x float>)
-declare <4 x float> @llvm.round.v4f32(<4 x float>)
+declare float @llvm.round.f32(float) #1
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }