[llvm] 1c9a09f - [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x), better
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 31 06:41:29 PDT 2020
Author: Sanjay Patel
Date: 2020-08-31T09:35:59-04:00
New Revision: 1c9a09f42e5ed66cba04700f9272ff53ea3cca86
URL: https://github.com/llvm/llvm-project/commit/1c9a09f42e5ed66cba04700f9272ff53ea3cca86
DIFF: https://github.com/llvm/llvm-project/commit/1c9a09f42e5ed66cba04700f9272ff53ea3cca86.diff
LOG: [DAGCombiner] skip reciprocal divisor optimization for x/sqrt(x), better
I tried to fix this in:
rG716e35a0cf53
...but that patch depends on the order in which we encounter the
magic "x/sqrt(x)" expression in the combiner's worklist.
This patch should improve on that by deferring the decision until we
walk the divisor's user list, where we can tell whether a use should be skipped.
The AArch64 test reveals another (existing) ordering problem
though - we may try to create an estimate for plain sqrt(x)
before we see that it is part of a 1/sqrt(x) expression.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
llvm/test/CodeGen/X86/sqrt-fastmath.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 650f68d764a0..a570581e89bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13235,11 +13235,6 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
return SDValue();
- // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
- if (N1.getOpcode() == ISD::FSQRT && N1.getOperand(0) == N0 &&
- Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())
- return SDValue();
-
// Exit early if the target does not want this transform or if there can't
// possibly be enough uses of the divisor to make the transform worthwhile.
unsigned MinUses = TLI.combineRepeatedFPDivisors();
@@ -13259,6 +13254,13 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
SetVector<SDNode *> Users;
for (auto *U : N1->uses()) {
if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
+ // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
+ if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
+ U->getOperand(0) == U->getOperand(1).getOperand(0) &&
+ U->getFlags().hasAllowReassociation() &&
+ U->getFlags().hasNoSignedZeros())
+ continue;
+
// This division is eligible for optimization only if global unsafe math
// is enabled or if this division allows reciprocal formation.
if (UnsafeMath || U->getFlags().hasAllowReciprocal())
@@ -13470,6 +13472,10 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
return SDValue();
// FSQRT nodes have flags that propagate to the created nodes.
+ // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
+ // transform the fdiv, we may produce a sub-optimal estimate sequence
+ // because the reciprocal calculation may not have to filter out a
+ // 0.0 input.
return buildSqrtEstimate(N0, Flags);
}
diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
index 85886938a186..9f2c0e432446 100644
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -570,19 +570,16 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, double* %p1, double*
define double @sqrt_simplify_before_recip_3_uses_order(double %x, double* %p1, double* %p2) nounwind {
; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
; FAULT: // %bb.0:
+; FAULT-NEXT: mov x9, #140737488355328
; FAULT-NEXT: mov x8, #4631107791820423168
-; FAULT-NEXT: fmov d3, x8
-; FAULT-NEXT: mov x8, #140737488355328
-; FAULT-NEXT: fsqrt d1, d0
-; FAULT-NEXT: fmov d2, #1.00000000
-; FAULT-NEXT: movk x8, #16453, lsl #48
-; FAULT-NEXT: fdiv d1, d2, d1
-; FAULT-NEXT: fmov d2, x8
-; FAULT-NEXT: fmul d0, d0, d1
-; FAULT-NEXT: fmul d3, d1, d3
-; FAULT-NEXT: fmul d1, d1, d2
-; FAULT-NEXT: str d3, [x0]
-; FAULT-NEXT: str d1, [x1]
+; FAULT-NEXT: movk x9, #16453, lsl #48
+; FAULT-NEXT: fsqrt d0, d0
+; FAULT-NEXT: fmov d1, x8
+; FAULT-NEXT: fmov d2, x9
+; FAULT-NEXT: fdiv d1, d1, d0
+; FAULT-NEXT: fdiv d2, d2, d0
+; FAULT-NEXT: str d1, [x0]
+; FAULT-NEXT: str d2, [x1]
; FAULT-NEXT: ret
;
; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
@@ -644,21 +641,24 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, double* %p1, double*
; CHECK-NEXT: fmul d1, d1, d3
; CHECK-NEXT: fmul d3, d1, d1
; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: mov x8, #4631107791820423168
; CHECK-NEXT: fmul d1, d1, d3
+; CHECK-NEXT: mov x8, #4631107791820423168
+; CHECK-NEXT: fmul d3, d1, d1
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: mov x8, #140737488355328
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: movk x8, #16453, lsl #48
; CHECK-NEXT: frsqrts d3, d0, d3
+; CHECK-NEXT: movk x8, #16453, lsl #48
; CHECK-NEXT: fmul d1, d1, d3
-; CHECK-NEXT: fmov d3, x8
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: fmul d3, d0, d1
; CHECK-NEXT: fmul d2, d1, d2
-; CHECK-NEXT: fmul d3, d1, d3
-; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: fmul d4, d1, d4
; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: fcsel d1, d0, d3, eq
+; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: str d2, [x1]
-; CHECK-NEXT: str d3, [x2]
+; CHECK-NEXT: str d4, [x2]
; CHECK-NEXT: ret
%sqrt = tail call fast double @llvm.sqrt.f64(double %x)
%rsqrt = fdiv fast double 1.0, %sqrt
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 9735e46eb9c9..e51ef05580c0 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -997,21 +997,17 @@ define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, <2 x double
define double @sqrt_simplify_before_recip_order(double %x, double* %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE: # %bb.0:
-; SSE-NEXT: sqrtsd %xmm0, %xmm1
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: divsd %xmm1, %xmm2
-; SSE-NEXT: mulsd %xmm2, %xmm0
-; SSE-NEXT: mulsd {{.*}}(%rip), %xmm2
-; SSE-NEXT: movsd %xmm2, (%rdi)
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: divsd %xmm0, %xmm1
+; SSE-NEXT: movsd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX: # %bb.0:
-; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT: vdivsd %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovsd %xmm1, (%rdi)
; AVX-NEXT: retq
%sqrt = tail call fast double @llvm.sqrt.f64(double %x)
More information about the llvm-commits
mailing list