[llvm] r307308 - [AMDGPU] Always use rcp + mul with fast math

Thu Jul 6 13:34:21 PDT 2017

Author: rampitec
Date: Thu Jul  6 13:34:21 2017
New Revision: 307308

URL: http://llvm.org/viewvc/llvm-project?rev=307308&view=rev
Log:
[AMDGPU] Always use rcp + mul with fast math

Regardless of relaxation options such as -cl-fast-relaxed-math
we are producing rather long code for fdiv via amdgcn_fdiv_fast
intrinsic. This intrinsic is used to replace fdiv with 2.5ulp
metadata and does not handle denormals, thus believed to be fast.

An fdiv instruction can also have fast math flag either by itself
or together with fpmath metadata. Clang used with a relaxation flag
always produces both metadata and fast flag:

%div = fdiv fast float %v, %0, !fpmath !12
!12 = !{float 2.500000e+00}

Current implementation ignores fast flag and favors metadata. An
instruction with just fast flag would be lowered to a fastest rcp +
mul, but that never happen on practice because of described mutual
clang and BE behavior.

This change allows an "fdiv fast" to be always lowered as rcp + mul.

Differential Revision: https://reviews.llvm.org/D34844

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
    llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp?rev=307308&r1=307307&r2=307308&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp Thu Jul  6 13:34:21 2017
@@ -380,7 +380,9 @@ bool AMDGPUCodeGenPrepare::visitFDiv(Bin
   FastMathFlags FMF = FPOp->getFastMathFlags();
   bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                                       FMF.allowReciprocal();
-  if (ST->hasFP32Denormals() && !UnsafeDiv)
+
+  // With UnsafeDiv node will be optimized to just rcp and mul.
+  if (ST->hasFP32Denormals() || UnsafeDiv)
     return false;
 
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=307308&r1=307307&r2=307308&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Thu Jul  6 13:34:21 2017
@@ -3736,7 +3736,9 @@ SDValue SITargetLowering::lowerFastUnsaf
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   EVT VT = Op.getValueType();
-  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+  const SDNodeFlags Flags = Op->getFlags();
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
+                Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
 
   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
     return SDValue();
@@ -3771,15 +3773,11 @@ SDValue SITargetLowering::lowerFastUnsaf
     }
   }
 
-  const SDNodeFlags Flags = Op->getFlags();
-
-  if (Unsafe || Flags.hasAllowReciprocal()) {
+  if (Unsafe) {
     // Turn into multiply by the reciprocal.
     // x / y -> x * (1.0 / y)
-    SDNodeFlags NewFlags;
-    NewFlags.setUnsafeAlgebra(true);
     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
   }
 
   return SDValue();

Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll?rev=307308&r1=307307&r2=307308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll Thu Jul  6 13:34:21 2017
@@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpm
 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
-; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
@@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_v
 ; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
 ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: extractelement <2 x float> %x
-; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
-; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
@@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpma
 ; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
 ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
-; CHECK: store volatile <2 x float> %arcp.25ulp
-
-; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
-; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
@@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpma
 
 ; FIXME: Should be able to get fdiv for 1.0 component
 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
 ; CHECK: store volatile <2 x float> %arcp.25ulp
 
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
@@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpma
 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
-; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll?rev=307308&r1=307307&r2=307308&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll Thu Jul  6 13:34:21 2017
@@ -85,20 +85,11 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
-; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
-; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
-
-; GCN-NOT: s_setreg
-; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
-; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
-; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
 ; GCN-NOT: s_setreg
-; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
-; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
@@ -121,6 +112,21 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv fast float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
 ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
@@ -154,8 +160,9 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; GCN: v_cmp_gt_f32
-; GCN: v_cmp_gt_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN-NOT: v_cmp_gt_f32
 define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0