[llvm] r334862 - Utilize new SDNode flag functionality to expand current support for fdiv
Michael Berg via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 15 13:44:55 PDT 2018
Author: mcberg2017
Date: Fri Jun 15 13:44:55 2018
New Revision: 334862
URL: http://llvm.org/viewvc/llvm-project?rev=334862&view=rev
Log:
Utilize new SDNode flag functionality to expand current support for fdiv
Summary: This patch originated from D46562 and is a proper subset of it, with some issues addressed.
Reviewers: spatel, hfinkel, wristow, arsenm
Reviewed By: spatel
Subscribers: wdng, nhaehnle
Differential Revision: https://reviews.llvm.org/D47954
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
llvm/trunk/test/CodeGen/X86/fmf-flags.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=334862&r1=334861&r2=334862&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Fri Jun 15 13:44:55 2018
@@ -10865,7 +10865,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- if (Options.UnsafeFPMath) {
+ if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
// Compute the reciprocal 1.0 / c2.
Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll?rev=334862&r1=334861&r2=334862&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll Fri Jun 15 13:44:55 2018
@@ -218,7 +218,7 @@ define amdgpu_kernel void @div_arcp_2_x_
}
; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
@@ -230,7 +230,7 @@ define amdgpu_kernel void @div_arcp_k_x_
}
; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9: buffer_store_short [[MUL]]
Modified: llvm/trunk/test/CodeGen/X86/fmf-flags.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fmf-flags.ll?rev=334862&r1=334861&r2=334862&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fmf-flags.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fmf-flags.ll Fri Jun 15 13:44:55 2018
@@ -8,17 +8,11 @@ define float @fast_recip_sqrt(float %x)
; X64-LABEL: fast_recip_sqrt:
; X64: # %bb.0:
; X64-NEXT: rsqrtss %xmm0, %xmm1
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: cmpeqss %xmm0, %xmm2
; X64-NEXT: mulss %xmm1, %xmm0
-; X64-NEXT: movss {{.*}}(%rip), %xmm3
-; X64-NEXT: mulss %xmm0, %xmm3
; X64-NEXT: mulss %xmm1, %xmm0
; X64-NEXT: addss {{.*}}(%rip), %xmm0
-; X64-NEXT: mulss %xmm3, %xmm0
-; X64-NEXT: andnps %xmm0, %xmm2
-; X64-NEXT: movss {{.*}}(%rip), %xmm0
-; X64-NEXT: divss %xmm2, %xmm0
+; X64-NEXT: mulss {{.*}}(%rip), %xmm1
+; X64-NEXT: mulss %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fast_recip_sqrt:
@@ -89,10 +83,14 @@ define double @not_so_fast_mul_add(doubl
define float @not_so_fast_recip_sqrt(float %x) {
; X64-LABEL: not_so_fast_recip_sqrt:
; X64: # %bb.0:
-; X64-NEXT: sqrtss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: divss %xmm1, %xmm0
-; X64-NEXT: movss %xmm1, {{.*}}(%rip)
+; X64-NEXT: rsqrtss %xmm0, %xmm1
+; X64-NEXT: sqrtss %xmm0, %xmm2
+; X64-NEXT: mulss %xmm1, %xmm0
+; X64-NEXT: mulss %xmm1, %xmm0
+; X64-NEXT: addss {{.*}}(%rip), %xmm0
+; X64-NEXT: mulss {{.*}}(%rip), %xmm1
+; X64-NEXT: mulss %xmm1, %xmm0
+; X64-NEXT: movss %xmm2, sqrt1(%rip)
; X64-NEXT: retq
;
; X86-LABEL: not_so_fast_recip_sqrt:
@@ -111,3 +109,19 @@ define float @not_so_fast_recip_sqrt(flo
ret float %z
}
+define float @div_arcp_by_const(half %x) {
+; X64-LABEL: .LCPI4_0:
+; X64-NEXT: .long 1036828672
+; X64-LABEL: div_arcp_by_const:
+; X64: movzwl %ax, %edi
+; X64: mulss .LCPI4_0(%rip), %xmm0
+;
+; X86-LABEL: .LCPI4_0:
+; X86-NEXT: .long 1036828672
+; X86-LABEL: div_arcp_by_const:
+; X86: movzwl %ax, %eax
+; X86: fmuls .LCPI4_0
+ %rcp = fdiv arcp half %x, 10.0
+ %z = fpext half %rcp to float
+ ret float %z
+}
More information about the llvm-commits
mailing list