[llvm] r359398 - [DAGCombiner] try repeated fdiv divisor transform before building estimate
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 28 05:23:43 PDT 2019
Author: spatel
Date: Sun Apr 28 05:23:43 2019
New Revision: 359398
URL: http://llvm.org/viewvc/llvm-project?rev=359398&view=rev
Log:
[DAGCombiner] try repeated fdiv divisor transform before building estimate
This was originally part of D61028, but it's an independent diff.
If we try the repeated divisor reciprocal transform before producing an estimate sequence,
then we have an opportunity to use scalar fdiv. On x86, the trade-off is 1 divss vs. 5
vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen),
full-precision division has a throughput of only 3 cycles, so that's probably the better
default for performance, and it avoids problems from x86's inaccurate estimates.
The last 2 tests show that users still have the option to override the defaults by using
the function attributes for reciprocal estimates, but those patterns are potentially made
faster by converting the vector ops (including ymm ops) to scalar math.
Differential Revision: https://reviews.llvm.org/D61149
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/test/CodeGen/X86/fdiv-combine-vec.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=359398&r1=359397&r2=359398&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Sun Apr 28 05:23:43 2019
@@ -11992,6 +11992,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
+ if (SDValue V = combineRepeatedFPDivisors(N))
+ return V;
+
if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
@@ -12081,9 +12084,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N
}
}
- if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
- return CombineRepeatedDivisors;
-
return SDValue();
}
Modified: llvm/trunk/test/CodeGen/X86/fdiv-combine-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fdiv-combine-vec.ll?rev=359398&r1=359397&r2=359398&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fdiv-combine-vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fdiv-combine-vec.ll Sun Apr 28 05:23:43 2019
@@ -51,25 +51,17 @@ define <4 x double> @splat_fdiv_v4f64(<4
define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
; SSE-LABEL: splat_fdiv_v4f32:
; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: divss %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splat_fdiv_v4f32:
; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT: vrcpps %xmm1, %xmm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
@@ -90,14 +82,10 @@ define <8 x float> @splat_fdiv_v8f32(<8
;
; AVX-LABEL: splat_fdiv_v8f32:
; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vy = insertelement <8 x float> undef, float %y, i32 0
@@ -109,25 +97,25 @@ define <8 x float> @splat_fdiv_v8f32(<8
define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
; SSE-LABEL: splat_fdiv_v4f32_estimate:
; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
+; SSE-NEXT: rcpss %xmm1, %xmm2
+; SSE-NEXT: mulss %xmm2, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: subss %xmm1, %xmm3
+; SSE-NEXT: mulss %xmm2, %xmm3
+; SSE-NEXT: addss %xmm2, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splat_fdiv_v4f32_estimate:
; AVX: # %bb.0:
+; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT: vrcpps %xmm1, %xmm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vy = insertelement <4 x float> undef, float %y, i32 0
@@ -152,14 +140,14 @@ define <8 x float> @splat_fdiv_v8f32_est
;
; AVX-LABEL: splat_fdiv_v8f32_estimate:
; AVX: # %bb.0:
+; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vy = insertelement <8 x float> undef, float %y, i32 0
More information about the llvm-commits mailing list