# [llvm] r219139 - Fast-math fold: x / (y * sqrt(z)) -> x * (rsqrt(z) / y)

Sanjay Patel spatel at rotateright.com
Mon Oct 6 12:31:18 PDT 2014

```
Author: spatel
Date: Mon Oct  6 14:31:18 2014
New Revision: 219139

URL: http://llvm.org/viewvc/llvm-project?rev=219139&view=rev
Log:
Fast-math fold:  x / (y * sqrt(z)) -> x * (rsqrt(z) / y)

The motivation is to recognize code such as this from /llvm/projects/test-suite/SingleSource/Benchmarks/BenchmarkGame/n-body.c:

float distance = sqrt(dx * dx + dy * dy + dz * dz);
float mag = dt / (distance * distance * distance);

Without this patch, we don't match the sqrt as a reciprocal sqrt, so for PPC the new testcase in this patch produces:

addis 3, 2, .LCPI4_2@toc@ha
lfs 4, .LCPI4_2@toc@l(3)
addis 3, 2, .LCPI4_1@toc@ha
lfs 0, .LCPI4_1@toc@l(3)
fcmpu 0, 1, 4
beq 0, .LBB4_2
# BB#1:
frsqrtes 4, 1
addis 3, 2, .LCPI4_0@toc@ha
lfs 5, .LCPI4_0@toc@l(3)
fnmsubs 13, 1, 5, 1
fmuls 6, 4, 4
fmuls 1, 4, 1
fres 4, 1                <--- reciprocal of reciprocal square root
fnmsubs 1, 1, 4, 0
.LBB4_2:
fmuls 1, 4, 2
fres 2, 1
fnmsubs 0, 1, 2, 0
fmuls 1, 3, 0
blr

After the patch, this simplifies to:

frsqrtes 0, 1
addis 3, 2, .LCPI4_1@toc@ha
fres 5, 2
lfs 4, .LCPI4_1@toc@l(3)
addis 3, 2, .LCPI4_0@toc@ha
lfs 7, .LCPI4_0@toc@l(3)
fnmsubs 13, 1, 4, 1
fmuls 6, 0, 0
fnmsubs 2, 2, 5, 7
fmuls 0, 0, 1
fmuls 0, 0, 2
fmuls 1, 3, 0
blr

Differential Revision: http://reviews.llvm.org/D5628

Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/test/CodeGen/PowerPC/recipest.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=219139&r1=219138&r2=219139&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon Oct  6 14:31:18 2014
@@ -7036,6 +7036,28 @@ SDValue DAGCombiner::visitFDIV(SDNode *N
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
}
+    } else if (N1.getOpcode() == ISD::FMUL) {
+      // Look through an FMUL. Even though this won't remove the FDIV directly,
+      // it's still worthwhile to get rid of the FSQRT if possible.
+      SDValue SqrtOp;
+      SDValue OtherOp;
+      if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+        SqrtOp = N1.getOperand(0);
+        OtherOp = N1.getOperand(1);
+      } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
+        SqrtOp = N1.getOperand(1);
+        OtherOp = N1.getOperand(0);
+      }
+      if (SqrtOp.getNode()) {
+        // We found a FSQRT, so try to make this fold:
+        // x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
+        if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) {
+          RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp);
+          return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+        }
+      }
}

// Fold into a reciprocal estimate and multiply instead of a real divide.

Modified: llvm/trunk/test/CodeGen/PowerPC/recipest.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/recipest.ll?rev=219139&r1=219138&r2=219139&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/recipest.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/recipest.ll Mon Oct  6 14:31:18 2014
@@ -96,6 +96,34 @@ define float @goo(float %a, float %b) no
; CHECK-SAFE: blr
}

+; Recognize that this is rsqrt(a) * rcp(b) * c,
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
+define float @rsqrt_fmul(float %a, float %b, float %c) {
+  %x = call float @llvm.sqrt.f32(float %a)
+  %y = fmul float %x, %b
+  %z = fdiv float %c, %y
+  ret float %z
+
+; CHECK: @rsqrt_fmul
+; CHECK-DAG: frsqrtes
+; CHECK-DAG: fres
+; CHECK-DAG: fnmsubs
+; CHECK-DAG: fmuls
+; CHECK-DAG: fnmsubs
+; CHECK: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
+
+; CHECK-SAFE: @rsqrt_fmul
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fmuls
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
%x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
%r = fdiv <4 x float> %a, %x

```