[llvm-branch-commits] [llvm] 4d83aba - [DAGCombine] Adding a hook to improve the precision of fsqrt if the input is denormal

QingShan Zhang via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Nov 26 18:17:38 PST 2020


Author: QingShan Zhang
Date: 2020-11-27T02:10:55Z
New Revision: 4d83aba4228ecb7dfefaf10a36a35f7299467819

URL: https://github.com/llvm/llvm-project/commit/4d83aba4228ecb7dfefaf10a36a35f7299467819
DIFF: https://github.com/llvm/llvm-project/commit/4d83aba4228ecb7dfefaf10a36a35f7299467819.diff

LOG: [DAGCombine] Adding a hook to improve the precision of fsqrt if the input is denormal

Currently, we hardcode the result as 0.0 if the input is denormal or 0, which
hurts the precision. As the fsqrt added belongs to the cold path of the
cmp+branch, it won't impact the performance for normal inputs on PowerPC, but
it improves the precision if the input is denormal.

Reviewed By: Spatel

Differential Revision: https://reviews.llvm.org/D80974

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/lib/Target/PowerPC/PPCInstrInfo.td
    llvm/lib/Target/PowerPC/PPCInstrVSX.td
    llvm/test/CodeGen/PowerPC/fma-mutate.ll
    llvm/test/CodeGen/PowerPC/recipest.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 16580a9160b9..4aeefd980d7a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4286,6 +4286,13 @@ class TargetLowering : public TargetLoweringBase {
     return SDValue();
   }
 
+  /// Return a target-dependent result if the input operand is not suitable for
+  /// use with a square root estimate calculation.
+  virtual SDValue getSqrtResultForDenormInput(SDValue Operand,
+                                              SelectionDAG &DAG) const {
+    return DAG.getConstantFP(0.0, SDLoc(Operand), Operand.getValueType());
+  }
+
   //===--------------------------------------------------------------------===//
   // Legalization utility functions
   //

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4ac1743d2d34..1b5debfe602e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22052,8 +22052,6 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
             : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
 
       if (!Reciprocal) {
-        // The estimate is now completely wrong if the input was exactly 0.0 or
-        // possibly a denormal. Force the answer to 0.0 for those cases.
         SDLoc DL(Op);
         EVT CCVT = getSetCCResultType(VT);
         SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
@@ -22077,10 +22075,13 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
             // Test = X == 0.0
             Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
         }
-        // Test ? 0.0 : Est
-        Est = DAG.getNode(Test.getValueType().isVector() ? ISD::VSELECT
-                                                         : ISD::SELECT,
-                          DL, VT, Test, FPZero, Est);
+
+        // The estimate is now completely wrong if the input was exactly 0.0 or
+        // possibly a denormal. Force the answer to 0.0 or value provided by
+        // target for those cases.
+        Est = DAG.getNode(
+            Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
+            Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est);
       }
     }
     return Est;

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cf369f5f12c1..2d8dfb63f19c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1441,6 +1441,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
   case PPCISD::FTSQRT:
     return "PPCISD::FTSQRT";
+  case PPCISD::FSQRT:
+    return "PPCISD::FSQRT";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
@@ -12761,6 +12763,17 @@ SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
                  0);
 }
 
+SDValue
+PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // TODO - add support for v2f64/v4f32
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f64)
+    return TargetLowering::getSqrtResultForDenormInput(Op, DAG);
+
+  return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op);
+}
+
 SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled, int &RefinementSteps,
                                            bool &UseOneConstNR,

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 8fad97c618b7..63b59dd91604 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -92,6 +92,9 @@ namespace llvm {
     /// Test instruction for software square root.
     FTSQRT,
 
+    /// Square root instruction.
+    FSQRT,
+
     /// VPERM - The PPC VPERM Instruction.
     ///
     VPERM,
@@ -1287,6 +1290,8 @@ namespace llvm {
                              int &RefinementSteps) const override;
     SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
                              const DenormalMode &Mode) const override;
+    SDValue getSqrtResultForDenormInput(SDValue Operand,
+                                        SelectionDAG &DAG) const override;
     unsigned combineRepeatedFPDivisors() const override;
 
     SDValue

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index de9ae99adac7..1496accf83d8 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -127,6 +127,7 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [
 
 def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
 def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+def PPCfsqrt  : SDNode<"PPCISD::FSQRT",   SDTFPUnaryOp, []>;
 def PPCftsqrt : SDNode<"PPCISD::FTSQRT",  SDT_PPCFtsqrt,[]>;
 
 def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
@@ -2706,6 +2707,8 @@ let Uses = [RM], mayRaiseFPException = 1, hasSideEffects = 0 in {
 }
 }
 
+def : Pat<(PPCfsqrt f64:$frA), (FSQRT $frA)>;
+
 /// Note that FMR is defined as pseudo-ops on the PPC970 because they are
 /// often coalesced away and we don't want the dispatch group builder to think
 /// that they will fill slots (which could cause the load of a LSU reject to

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index b023c0596063..e778ca4be6b5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -2463,6 +2463,8 @@ def : Pat<(fneg (PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C)),
 def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)),
           (XVNMADDASP $C, $A, $B)>;
 
+def : Pat<(PPCfsqrt f64:$frA), (XSSQRTDP $frA)>;
+
 def : Pat<(v2f64 (bitconvert v4f32:$A)),
           (COPY_TO_REGCLASS $A, VSRC)>;
 def : Pat<(v2f64 (bitconvert v4i32:$A)),

diff  --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll
index 62cce7362c68..0c85c2457ff5 100644
--- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll
@@ -10,7 +10,6 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-LABEL: foo3_fmf:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xstsqrtdp 0, 1
-; CHECK-NEXT:    xxlxor 0, 0, 0
 ; CHECK-NEXT:    bc 12, 2, .LBB0_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    xsrsqrtedp 0, 1
@@ -25,9 +24,10 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-NEXT:    xsmuldp 1, 1, 0
 ; CHECK-NEXT:    xsmaddadp 3, 1, 0
 ; CHECK-NEXT:    xsmuldp 0, 1, 4
-; CHECK-NEXT:    xsmuldp 0, 0, 3
+; CHECK-NEXT:    xsmuldp 1, 0, 3
+; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    fmr 1, 0
+; CHECK-NEXT:    xssqrtdp 1, 1
 ; CHECK-NEXT:    blr
   %r = call reassoc afn ninf double @llvm.sqrt.f64(double %a)
   ret double %r

diff  --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll
index cd8520b35ffa..3d9f2efc32e0 100644
--- a/llvm/test/CodeGen/PowerPC/recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/recipest.ll
@@ -767,14 +767,12 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-P7-NEXT:    fmul 1, 1, 0
 ; CHECK-P7-NEXT:    blr
 ; CHECK-P7-NEXT:  .LBB20_2:
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI20_2@toc@ha
-; CHECK-P7-NEXT:    lfs 1, .LCPI20_2@toc@l(3)
+; CHECK-P7-NEXT:    fsqrt 1, 1
 ; CHECK-P7-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: foo3_fmf:
 ; CHECK-P8:       # %bb.0:
 ; CHECK-P8-NEXT:    xstsqrtdp 0, 1
-; CHECK-P8-NEXT:    xxlxor 0, 0, 0
 ; CHECK-P8-NEXT:    bc 12, 2, .LBB20_2
 ; CHECK-P8-NEXT:  # %bb.1:
 ; CHECK-P8-NEXT:    xsrsqrtedp 0, 1
@@ -790,15 +788,15 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-P8-NEXT:    xsmuldp 1, 1, 0
 ; CHECK-P8-NEXT:    xsmaddadp 3, 1, 0
 ; CHECK-P8-NEXT:    xsmuldp 0, 1, 4
-; CHECK-P8-NEXT:    xsmuldp 0, 0, 3
+; CHECK-P8-NEXT:    xsmuldp 1, 0, 3
+; CHECK-P8-NEXT:    blr
 ; CHECK-P8-NEXT:  .LBB20_2:
-; CHECK-P8-NEXT:    fmr 1, 0
+; CHECK-P8-NEXT:    xssqrtdp 1, 1
 ; CHECK-P8-NEXT:    blr
 ;
 ; CHECK-P9-LABEL: foo3_fmf:
 ; CHECK-P9:       # %bb.0:
 ; CHECK-P9-NEXT:    xstsqrtdp 0, 1
-; CHECK-P9-NEXT:    xxlxor 0, 0, 0
 ; CHECK-P9-NEXT:    bc 12, 2, .LBB20_2
 ; CHECK-P9-NEXT:  # %bb.1:
 ; CHECK-P9-NEXT:    xsrsqrtedp 0, 1
@@ -814,9 +812,10 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-P9-NEXT:    xsmuldp 1, 1, 0
 ; CHECK-P9-NEXT:    xsmaddadp 3, 1, 0
 ; CHECK-P9-NEXT:    xsmuldp 0, 1, 2
-; CHECK-P9-NEXT:    xsmuldp 0, 0, 3
+; CHECK-P9-NEXT:    xsmuldp 1, 0, 3
+; CHECK-P9-NEXT:    blr
 ; CHECK-P9-NEXT:  .LBB20_2:
-; CHECK-P9-NEXT:    fmr 1, 0
+; CHECK-P9-NEXT:    xssqrtdp 1, 1
 ; CHECK-P9-NEXT:    blr
   %r = call reassoc ninf afn double @llvm.sqrt.f64(double %a)
   ret double %r
@@ -1028,45 +1027,41 @@ define <4 x float> @hoo3_safe(<4 x float> %a) nounwind {
 define <2 x double> @hoo4_fmf(<2 x double> %a) #1 {
 ; CHECK-P7-LABEL: hoo4_fmf:
 ; CHECK-P7:       # %bb.0:
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_2@toc@ha
 ; CHECK-P7-NEXT:    ftsqrt 0, 1
-; CHECK-P7-NEXT:    fmr 3, 1
-; CHECK-P7-NEXT:    addis 4, 2, .LCPI26_0@toc@ha
-; CHECK-P7-NEXT:    lfs 0, .LCPI26_2@toc@l(3)
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_1@toc@ha
-; CHECK-P7-NEXT:    lfs 5, .LCPI26_0@toc@l(4)
-; CHECK-P7-NEXT:    lfs 4, .LCPI26_1@toc@l(3)
-; CHECK-P7-NEXT:    fmr 1, 0
-; CHECK-P7-NEXT:    bc 4, 2, .LBB26_3
+; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_0@toc@ha
+; CHECK-P7-NEXT:    addis 4, 2, .LCPI26_1@toc@ha
+; CHECK-P7-NEXT:    lfs 3, .LCPI26_0@toc@l(3)
+; CHECK-P7-NEXT:    lfs 0, .LCPI26_1@toc@l(4)
+; CHECK-P7-NEXT:    bc 12, 2, .LBB26_3
 ; CHECK-P7-NEXT:  # %bb.1:
+; CHECK-P7-NEXT:    frsqrte 4, 1
+; CHECK-P7-NEXT:    fmul 5, 1, 4
+; CHECK-P7-NEXT:    fmadd 5, 5, 4, 3
+; CHECK-P7-NEXT:    fmul 4, 4, 0
+; CHECK-P7-NEXT:    fmul 4, 4, 5
+; CHECK-P7-NEXT:    fmul 1, 1, 4
+; CHECK-P7-NEXT:    fmadd 4, 1, 4, 3
+; CHECK-P7-NEXT:    fmul 1, 1, 0
+; CHECK-P7-NEXT:    fmul 1, 1, 4
 ; CHECK-P7-NEXT:    ftsqrt 0, 2
 ; CHECK-P7-NEXT:    bc 4, 2, .LBB26_4
 ; CHECK-P7-NEXT:  .LBB26_2:
-; CHECK-P7-NEXT:    fmr 2, 0
+; CHECK-P7-NEXT:    fsqrt 2, 2
 ; CHECK-P7-NEXT:    blr
 ; CHECK-P7-NEXT:  .LBB26_3:
-; CHECK-P7-NEXT:    frsqrte 1, 3
-; CHECK-P7-NEXT:    fmul 6, 3, 1
-; CHECK-P7-NEXT:    fmadd 6, 6, 1, 5
-; CHECK-P7-NEXT:    fmul 1, 1, 4
-; CHECK-P7-NEXT:    fmul 1, 1, 6
-; CHECK-P7-NEXT:    fmul 3, 3, 1
-; CHECK-P7-NEXT:    fmadd 1, 3, 1, 5
-; CHECK-P7-NEXT:    fmul 3, 3, 4
-; CHECK-P7-NEXT:    fmul 1, 3, 1
+; CHECK-P7-NEXT:    fsqrt 1, 1
 ; CHECK-P7-NEXT:    ftsqrt 0, 2
 ; CHECK-P7-NEXT:    bc 12, 2, .LBB26_2
 ; CHECK-P7-NEXT:  .LBB26_4:
-; CHECK-P7-NEXT:    frsqrte 0, 2
-; CHECK-P7-NEXT:    fmul 3, 2, 0
-; CHECK-P7-NEXT:    fmadd 3, 3, 0, 5
-; CHECK-P7-NEXT:    fmul 0, 0, 4
-; CHECK-P7-NEXT:    fmul 0, 0, 3
-; CHECK-P7-NEXT:    fmul 2, 2, 0
-; CHECK-P7-NEXT:    fmadd 0, 2, 0, 5
+; CHECK-P7-NEXT:    frsqrte 4, 2
+; CHECK-P7-NEXT:    fmul 5, 2, 4
+; CHECK-P7-NEXT:    fmadd 5, 5, 4, 3
+; CHECK-P7-NEXT:    fmul 4, 4, 0
+; CHECK-P7-NEXT:    fmul 4, 4, 5
 ; CHECK-P7-NEXT:    fmul 2, 2, 4
+; CHECK-P7-NEXT:    fmadd 3, 2, 4, 3
 ; CHECK-P7-NEXT:    fmul 0, 2, 0
-; CHECK-P7-NEXT:    fmr 2, 0
+; CHECK-P7-NEXT:    fmul 2, 0, 3
 ; CHECK-P7-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: hoo4_fmf:


        


More information about the llvm-branch-commits mailing list