[llvm] r323981 - [DAGCombiner] filter out denorm inputs when calculating sqrt estimate (PR34994)

Thu Feb 1 08:57:18 PST 2018

Author: spatel
Date: Thu Feb  1 08:57:18 2018
New Revision: 323981

URL: http://llvm.org/viewvc/llvm-project?rev=323981&view=rev
Log:
[DAGCombiner] filter out denorm inputs when calculating sqrt estimate (PR34994)

As shown in the example in PR34994:
https://bugs.llvm.org/show_bug.cgi?id=34994
...we can return a very wrong answer (inf instead of 0.0) for square root when 
using a reciprocal square root estimate instruction.

Here, I've conditionalized the filtering out of denorms based on the function 
having "denormal-fp-math"="ieee" in its attributes. The other options for this 
attribute are 'preserve-sign' and 'positive-zero'.

So we don't generate this extra code by default with just '-ffast-math' (because 
then there's no denormal attribute string at all), but it works if you specify 
'-ffast-math -fdenormal-fp-math=ieee' from clang. 

As noted in the review, there may be other problems in clang that affect the 
results depending on platform (Linux x86 at least), but this should allow 
creating the desired codegen.

Differential Revision: https://reviews.llvm.org/D42323


Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=323981&r1=323980&r2=323981&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Feb  1 08:57:18 2018
@@ -17454,19 +17454,34 @@ SDValue DAGCombiner::buildSqrtEstimateIm
             : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
 
       if (!Reciprocal) {
-        // Unfortunately, Est is now NaN if the input was exactly 0.0.
-        // Select out this case and force the answer to 0.0.
+        // The estimate is now completely wrong if the input was exactly 0.0 or
+        // possibly a denormal. Force the answer to 0.0 for those cases.
         EVT VT = Op.getValueType();
         SDLoc DL(Op);
-
-        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
         EVT CCVT = getSetCCResultType(VT);
-        SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
-        AddToWorklist(ZeroCmp.getNode());
-
-        Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
-                          ZeroCmp, FPZero, Est);
-        AddToWorklist(Est.getNode());
+        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+        const Function &F = DAG.getMachineFunction().getFunction();
+        Attribute Denorms = F.getFnAttribute("denormal-fp-math");
+        if (Denorms.getValueAsString().equals("ieee")) {
+          // fabs(X) < SmallestNormal ? 0.0 : Est
+          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
+          AddToWorklist(Fabs.getNode());
+          AddToWorklist(IsDenorm.getNode());
+          AddToWorklist(Est.getNode());
+        } else {
+          // X == 0.0 ? 0.0 : Est
+          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+          AddToWorklist(IsZero.getNode());
+          AddToWorklist(Est.getNode());
+        }
       }
     }
     return Est;

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=323981&r1=323980&r2=323981&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Thu Feb  1 08:57:18 2018
@@ -5007,7 +5007,9 @@ SDValue AArch64TargetLowering::getSqrtEs
         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
       }
-
+      // FIXME: This does not detect denorm inputs, so we might produce INF
+      // when we should produce 0.0. Try to refactor the code in DAGCombiner,
+      // so we don't have to duplicate it here.
       if (!Reciprocal) {
         EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                       VT);

Modified: llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll?rev=323981&r1=323980&r2=323981&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll Thu Feb  1 08:57:18 2018
@@ -121,8 +121,8 @@ define float @sqrtf_check_denorms(float
 ; SSE-NEXT:    mulss %xmm1, %xmm2
 ; SSE-NEXT:    addss {{.*}}(%rip), %xmm2
 ; SSE-NEXT:    mulss %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpeqss %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    cmpltss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    andnps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
@@ -134,8 +134,8 @@ define float @sqrtf_check_denorms(float
 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
 ; AVX-NEXT:    vmulss %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vcmpltss {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vandnps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call float @__sqrtf_finite(float %x) #2
@@ -145,17 +145,19 @@ define float @sqrtf_check_denorms(float
 define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    rsqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    mulps %xmm1, %xmm2
+; SSE-NEXT:    rsqrtps %xmm0, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01]
-; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    mulps %xmm1, %xmm2
-; SSE-NEXT:    addps {{.*}}(%rip), %xmm2
-; SSE-NEXT:    mulps %xmm3, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    cmpneqps %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm1, %xmm3
+; SSE-NEXT:    mulps %xmm2, %xmm1
+; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
+; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; SSE-NEXT:    cmpleps %xmm0, %xmm2
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sqrt_v4f32_check_denorms:
@@ -166,8 +168,9 @@ define <4 x float> @sqrt_v4f32_check_den
 ; AVX-NEXT:    vmulps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vaddps {{.*}}(%rip), %xmm1, %xmm1
 ; AVX-NEXT:    vmulps %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpneqps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovaps {{.*#+}} xmm2 = [1.175494e-38,1.175494e-38,1.175494e-38,1.175494e-38]
+; AVX-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2