[llvm] 1c9a09f - [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x), better

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 31 06:41:29 PDT 2020


Author: Sanjay Patel
Date: 2020-08-31T09:35:59-04:00
New Revision: 1c9a09f42e5ed66cba04700f9272ff53ea3cca86

URL: https://github.com/llvm/llvm-project/commit/1c9a09f42e5ed66cba04700f9272ff53ea3cca86
DIFF: https://github.com/llvm/llvm-project/commit/1c9a09f42e5ed66cba04700f9272ff53ea3cca86.diff

LOG: [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x), better

I tried to fix this in rG716e35a0cf53, but that patch depends
on the order in which we encounter the magic "x/sqrt(x)"
expression in the combiner's worklist.

This patch should improve that by waiting until we walk the
divisor's user list to decide whether there is a use to skip.
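
As a rough sketch of the shape in question (the function and value
names below are hypothetical, not taken from the patch or its tests),
the divisor %s has three fdiv users, but %div0 is really x/sqrt(x)
and will later fold to sqrt(x) under 'reassoc nsz', so it should not
be counted or rewritten by the repeated-divisor-to-reciprocal
transform:

  ; Hypothetical IR: skip %div0 when walking the users of %s.
  define double @repeated_divisor(double %x, double %a, double %b) {
    %s = call reassoc nsz arcp double @llvm.sqrt.f64(double %x)
    %div0 = fdiv reassoc nsz arcp double %x, %s ; x/sqrt(x) -> sqrt(x)
    %div1 = fdiv arcp double %a, %s
    %div2 = fdiv arcp double %b, %s
    %t0 = fadd double %div0, %div1
    %t1 = fadd double %t0, %div2
    ret double %t1
  }
  declare double @llvm.sqrt.f64(double)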

The AArch64 test reveals another (pre-existing) ordering
problem, though: we may try to create an estimate for plain
sqrt(x) before we see that it is part of a 1/sqrt(x)
expression.
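
For reference, that remaining problem has roughly this shape (the
hypothetical sketch below is illustrative, not copied from the
tests): if visitFSQRT expands %sqrt into an estimate before the
combiner sees that %sqrt feeds the reciprocal divide, the resulting
sequence may carry a 0.0 filter that a direct rsqrt estimate would
not need.

  ; Hypothetical IR: %sqrt may be turned into an estimate sequence
  ; before the fast divide below is recognized as 1/sqrt(x).
  define double @rsqrt_after_sqrt(double %x, double* %p) {
    %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
    %rsqrt = fdiv fast double 1.0, %sqrt
    store double %rsqrt, double* %p
    ret double %sqrt
  }
  declare double @llvm.sqrt.f64(double)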

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
    llvm/test/CodeGen/X86/sqrt-fastmath.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 650f68d764a0..a570581e89bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13235,11 +13235,6 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
   if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
     return SDValue();
 
-  // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
-  if (N1.getOpcode() == ISD::FSQRT && N1.getOperand(0) == N0 &&
-      Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())
-    return SDValue();
-
   // Exit early if the target does not want this transform or if there can't
   // possibly be enough uses of the divisor to make the transform worthwhile.
   unsigned MinUses = TLI.combineRepeatedFPDivisors();
@@ -13259,6 +13254,13 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
   SetVector<SDNode *> Users;
   for (auto *U : N1->uses()) {
     if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
+      // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
+      if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
+          U->getOperand(0) == U->getOperand(1).getOperand(0) &&
+          U->getFlags().hasAllowReassociation() &&
+          U->getFlags().hasNoSignedZeros())
+        continue;
+
       // This division is eligible for optimization only if global unsafe math
       // is enabled or if this division allows reciprocal formation.
       if (UnsafeMath || U->getFlags().hasAllowReciprocal())
@@ -13470,6 +13472,10 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
     return SDValue();
 
   // FSQRT nodes have flags that propagate to the created nodes.
+  // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
+  //       transform the fdiv, we may produce a sub-optimal estimate sequence
+  //       because the reciprocal calculation may not have to filter out a
+  //       0.0 input.
   return buildSqrtEstimate(N0, Flags);
 }
 

diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
index 85886938a186..9f2c0e432446 100644
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -570,19 +570,16 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, double* %p1, double*
 define double @sqrt_simplify_before_recip_3_uses_order(double %x, double* %p1, double* %p2) nounwind {
 ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
 ; FAULT:       // %bb.0:
+; FAULT-NEXT:    mov x9, #140737488355328
 ; FAULT-NEXT:    mov x8, #4631107791820423168
-; FAULT-NEXT:    fmov d3, x8
-; FAULT-NEXT:    mov x8, #140737488355328
-; FAULT-NEXT:    fsqrt d1, d0
-; FAULT-NEXT:    fmov d2, #1.00000000
-; FAULT-NEXT:    movk x8, #16453, lsl #48
-; FAULT-NEXT:    fdiv d1, d2, d1
-; FAULT-NEXT:    fmov d2, x8
-; FAULT-NEXT:    fmul d0, d0, d1
-; FAULT-NEXT:    fmul d3, d1, d3
-; FAULT-NEXT:    fmul d1, d1, d2
-; FAULT-NEXT:    str d3, [x0]
-; FAULT-NEXT:    str d1, [x1]
+; FAULT-NEXT:    movk x9, #16453, lsl #48
+; FAULT-NEXT:    fsqrt d0, d0
+; FAULT-NEXT:    fmov d1, x8
+; FAULT-NEXT:    fmov d2, x9
+; FAULT-NEXT:    fdiv d1, d1, d0
+; FAULT-NEXT:    fdiv d2, d2, d0
+; FAULT-NEXT:    str d1, [x0]
+; FAULT-NEXT:    str d2, [x1]
 ; FAULT-NEXT:    ret
 ;
 ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
@@ -644,21 +641,24 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, double* %p1, double*
 ; CHECK-NEXT:    fmul d1, d1, d3
 ; CHECK-NEXT:    fmul d3, d1, d1
 ; CHECK-NEXT:    frsqrts d3, d0, d3
-; CHECK-NEXT:    mov x8, #4631107791820423168
 ; CHECK-NEXT:    fmul d1, d1, d3
+; CHECK-NEXT:    mov x8, #4631107791820423168
+; CHECK-NEXT:    fmul d3, d1, d1
 ; CHECK-NEXT:    fmov d2, x8
 ; CHECK-NEXT:    mov x8, #140737488355328
-; CHECK-NEXT:    fmul d3, d1, d1
-; CHECK-NEXT:    movk x8, #16453, lsl #48
 ; CHECK-NEXT:    frsqrts d3, d0, d3
+; CHECK-NEXT:    movk x8, #16453, lsl #48
 ; CHECK-NEXT:    fmul d1, d1, d3
-; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    fmov d4, x8
+; CHECK-NEXT:    fmul d3, d0, d1
 ; CHECK-NEXT:    fmul d2, d1, d2
-; CHECK-NEXT:    fmul d3, d1, d3
-; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    fmul d4, d1, d4
 ; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    fcsel d1, d0, d3, eq
+; CHECK-NEXT:    fdiv d0, d0, d1
 ; CHECK-NEXT:    str d2, [x1]
-; CHECK-NEXT:    str d3, [x2]
+; CHECK-NEXT:    str d4, [x2]
 ; CHECK-NEXT:    ret
   %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
   %rsqrt = fdiv fast double 1.0, %sqrt

diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 9735e46eb9c9..e51ef05580c0 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -997,21 +997,17 @@ define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double
 define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip_order:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtsd %xmm0, %xmm1
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    divsd %xmm1, %xmm2
-; SSE-NEXT:    mulsd %xmm2, %xmm0
-; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm2
-; SSE-NEXT:    movsd %xmm2, (%rdi)
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movsd %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_simplify_before_recip_order:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vdivsd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vmovsd %xmm1, (%rdi)
 ; AVX-NEXT:    retq
   %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
