[llvm] 716e35a - [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 30 08:01:42 PDT 2020


Author: Sanjay Patel
Date: 2020-08-30T10:55:45-04:00
New Revision: 716e35a0cf53e85a5fddc7ff86b79a751b1b2040

URL: https://github.com/llvm/llvm-project/commit/716e35a0cf53e85a5fddc7ff86b79a751b1b2040
DIFF: https://github.com/llvm/llvm-project/commit/716e35a0cf53e85a5fddc7ff86b79a751b1b2040.diff

LOG: [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x)

In general, we probably want to try the multi-use reciprocal
transform before sqrt transforms, but x/sqrt(x) is a special case
because it always reduces to plain sqrt(x) or an estimate.
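
For illustration only (a minimal sketch, not taken from the patch;
the function name is made up), the special case in IR, with the
reassoc and nsz flags that the new check requires, looks like:

  define double @x_div_sqrt_x(double %x) {
    ; reassoc+nsz are the flags the new combineRepeatedFPDivisors
    ; check looks for before bailing out
    %s = call reassoc nsz double @llvm.sqrt.f64(double %x)
    ; x/sqrt(x) == sqrt(x), so this folds to %s itself rather than
    ; feeding the repeated-divisor transform
    %r = fdiv reassoc nsz double %x, %s
    ret double %r
  }
  declare double @llvm.sqrt.f64(double)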

The AArch64 tests show that the transform is limited by the TLI
hook to patterns where there are 3 or more uses of the divisor.
So this change can result in an extra division compared to
what we had, but that's the intended behavior based on the
current setting of that hook.
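
As a sketch of the shape those tests exercise (the function name,
stores, and the 42.0 constant are illustrative, not copied from the
test file): three fdivs share the sqrt as their divisor, which meets
the hook's 3-use minimum, but after this patch the x/sqrt(x)
division opts out, leaving the other two as real divisions in the
updated FAULT output below:

  define double @recip_sqrt_3_uses(double %x, double* %p1, double* %p2) {
    %s = call fast double @llvm.sqrt.f64(double %x)
    ; two ordinary uses of the divisor...
    %r1 = fdiv fast double 1.0, %s
    store double %r1, double* %p1
    %r2 = fdiv fast double 42.0, %s
    store double %r2, double* %p2
    ; ...plus x/sqrt(x), which is now skipped because it will
    ; simplify to plain sqrt(x) anyway
    %r3 = fdiv fast double %x, %s
    ret double %r3
  }
  declare double @llvm.sqrt.f64(double)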

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
    llvm/test/CodeGen/X86/sqrt-fastmath.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ddf818784ee3..650f68d764a0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13230,14 +13230,18 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
     return SDValue();
 
   // Skip if current node is a reciprocal/fneg-reciprocal.
-  SDValue N0 = N->getOperand(0);
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
   if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
     return SDValue();
 
+  // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
+  if (N1.getOpcode() == ISD::FSQRT && N1.getOperand(0) == N0 &&
+      Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())
+    return SDValue();
+
   // Exit early if the target does not want this transform or if there can't
   // possibly be enough uses of the divisor to make the transform worthwhile.
-  SDValue N1 = N->getOperand(1);
   unsigned MinUses = TLI.combineRepeatedFPDivisors();
 
   // For splat vectors, scale the number of uses by the splat factor. If we can

diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
index 7fa64073bb4e..ccc100e2b86f 100644
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -529,13 +529,12 @@ define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwin
 define double @sqrt_simplify_before_recip_3_uses(double %x, double* %p1, double* %p2) nounwind {
 ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses:
 ; FAULT:       // %bb.0:
-; FAULT-NEXT:    fsqrt d1, d0
-; FAULT-NEXT:    fmov d2, #1.00000000
 ; FAULT-NEXT:    mov x8, #4631107791820423168
-; FAULT-NEXT:    fdiv d1, d2, d1
+; FAULT-NEXT:    fsqrt d0, d0
+; FAULT-NEXT:    fmov d1, #1.00000000
 ; FAULT-NEXT:    fmov d2, x8
-; FAULT-NEXT:    fmul d2, d1, d2
-; FAULT-NEXT:    fmul d0, d0, d1
+; FAULT-NEXT:    fdiv d1, d1, d0
+; FAULT-NEXT:    fdiv d2, d2, d0
 ; FAULT-NEXT:    str d1, [x0]
 ; FAULT-NEXT:    str d2, [x1]
 ; FAULT-NEXT:    ret
@@ -572,19 +571,18 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, double* %p1, double*
 ; FAULT-LABEL: sqrt_simplify_before_recip_4_uses:
 ; FAULT:       // %bb.0:
 ; FAULT-NEXT:    mov x8, #4631107791820423168
-; FAULT-NEXT:    fmov d3, x8
+; FAULT-NEXT:    fmov d2, x8
 ; FAULT-NEXT:    mov x8, #140737488355328
-; FAULT-NEXT:    fsqrt d1, d0
-; FAULT-NEXT:    fmov d2, #1.00000000
+; FAULT-NEXT:    fsqrt d0, d0
+; FAULT-NEXT:    fmov d1, #1.00000000
 ; FAULT-NEXT:    movk x8, #16453, lsl #48
-; FAULT-NEXT:    fdiv d1, d2, d1
-; FAULT-NEXT:    fmov d2, x8
-; FAULT-NEXT:    fmul d3, d1, d3
+; FAULT-NEXT:    fdiv d1, d1, d0
+; FAULT-NEXT:    fmov d3, x8
 ; FAULT-NEXT:    fmul d2, d1, d2
-; FAULT-NEXT:    fmul d0, d0, d1
+; FAULT-NEXT:    fmul d3, d1, d3
 ; FAULT-NEXT:    str d1, [x0]
-; FAULT-NEXT:    str d3, [x1]
-; FAULT-NEXT:    str d2, [x2]
+; FAULT-NEXT:    str d2, [x1]
+; FAULT-NEXT:    str d3, [x2]
 ; FAULT-NEXT:    ret
 ;
 ; CHECK-LABEL: sqrt_simplify_before_recip_4_uses:

diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index e44eb613eb13..f69b375b062c 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -951,19 +951,17 @@ define double @sqrt_fdiv_common_operand_extra_use(double %x, double* %p) nounwin
 define double @sqrt_simplify_before_recip(double %x, double* %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtsd %xmm0, %xmm1
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    divsd %xmm1, %xmm2
-; SSE-NEXT:    mulsd %xmm2, %xmm0
-; SSE-NEXT:    movsd %xmm2, (%rdi)
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movsd %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_simplify_before_recip:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vdivsd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vmovsd %xmm1, (%rdi)
 ; AVX-NEXT:    retq
   %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
@@ -976,19 +974,17 @@ define double @sqrt_simplify_before_recip(double %x, double* %p) nounwind {
 define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double>* %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip_vec:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    sqrtpd %xmm0, %xmm1
-; SSE-NEXT:    movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0]
-; SSE-NEXT:    divpd %xmm1, %xmm2
-; SSE-NEXT:    mulpd %xmm2, %xmm0
-; SSE-NEXT:    movupd %xmm2, (%rdi)
+; SSE-NEXT:    sqrtpd %xmm0, %xmm0
+; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; SSE-NEXT:    divpd %xmm0, %xmm1
+; SSE-NEXT:    movupd %xmm1, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_simplify_before_recip_vec:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtpd %xmm0, %xmm1
-; AVX-NEXT:    vmovapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0]
-; AVX-NEXT:    vdivpd %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
+; AVX-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vmovupd %xmm1, (%rdi)
 ; AVX-NEXT:    retq
   %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)

