[llvm] r277725 - [X86] Heuristic to selectively build Newton-Raphson SQRT estimation

Nikolai Bozhenov via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 4 05:47:29 PDT 2016


Author: n.bozhenov
Date: Thu Aug  4 07:47:28 2016
New Revision: 277725

URL: http://llvm.org/viewvc/llvm-project?rev=277725&view=rev
Log:
[X86] Heuristic to selectively build Newton-Raphson SQRT estimation

On modern Intel processors, hardware SQRT is in many cases faster than RSQRT
followed by Newton-Raphson refinement. The patch introduces a simple heuristic
to choose between the hardware SQRT instruction and Newton-Raphson software
estimation.

The patch treats scalars and vectors differently. The heuristic is that for
scalars the compiler should optimize for latency, while for vectors it should
optimize for throughput. It is based on the assumption that throughput-bound
code is likely to be vectorized.

Basically, the patch disables scalar NR for big cores and disables NR completely
for Skylake. Firstly, scalar SQRT has shorter latency than the NR code on big
cores. Secondly, vector SQRT has been greatly improved in Skylake and has better
throughput compared to NR.

Differential Revision: https://reviews.llvm.org/D21379


Added:
    llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll
Modified:
    llvm/trunk/include/llvm/Target/TargetLowering.h
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/lib/Target/X86/X86.td
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86Subtarget.cpp
    llvm/trunk/lib/Target/X86/X86Subtarget.h

Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetLowering.h Thu Aug  4 07:47:28 2016
@@ -243,9 +243,10 @@ public:
     return true;
   }
 
-  /// Return true if sqrt(x) is as cheap or cheaper than 1 / rsqrt(x)
-  bool isFsqrtCheap() const {
-    return FsqrtIsCheap;
+  /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
+  virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
+    // Default behavior is to replace SQRT(X) with X*RSQRT(X).
+    return false;
   }
 
   /// Returns true if target has indicated at least one type should be bypassed.
@@ -1381,10 +1382,6 @@ protected:
   /// control.
   void setJumpIsExpensive(bool isExpensive = true);
 
-  /// Tells the code generator that fsqrt is cheap, and should not be replaced
-  /// with an alternative sequence of instructions.
-  void setFsqrtIsCheap(bool isCheap = true) { FsqrtIsCheap = isCheap; }
-
   /// Tells the code generator that this target supports floating point
   /// exceptions and cares about preserving floating point exception behavior.
   void setHasFloatingPointExceptions(bool FPExceptions = true) {
@@ -1910,9 +1907,6 @@ private:
   /// combined with "shift" to BitExtract instructions.
   bool HasExtractBitsInsn;
 
-  // Don't expand fsqrt with an approximation based on the inverse sqrt.
-  bool FsqrtIsCheap;
-
   /// Tells the code generator to bypass slow divide or remainder
   /// instructions. For example, BypassSlowDivWidths[32,8] tells the code
   /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Aug  4 07:47:28 2016
@@ -8907,14 +8907,18 @@ SDValue DAGCombiner::visitFREM(SDNode *N
 }
 
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
-  if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  if (TLI.isFsqrtCheap(N0, DAG))
     return SDValue();
 
   // TODO: FSQRT nodes should have flags that propagate to the created nodes.
   // For now, create a Flags object for use with all unsafe math transforms.
   SDNodeFlags Flags;
   Flags.setUnsafeAlgebra(true);
-  return buildSqrtEstimate(N->getOperand(0), &Flags);
+  return buildSqrtEstimate(N0, &Flags);
 }
 
 /// copysign(x, fp_extend(y)) -> copysign(x, y)

Modified: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp Thu Aug  4 07:47:28 2016
@@ -807,7 +807,6 @@ TargetLoweringBase::TargetLoweringBase(c
   SelectIsExpensive = false;
   HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
-  FsqrtIsCheap = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Thu Aug  4 07:47:28 2016
@@ -446,8 +446,6 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
   setSelectIsExpensive(false);
   PredictableSelectIsExpensive = false;
 
-  setFsqrtIsCheap(true);
-
   // We want to find all load dependencies for long chains of stores to enable
   // merging into very wide vectors. The problem is with vectors with > 4
   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Thu Aug  4 07:47:28 2016
@@ -166,6 +166,9 @@ public:
 
   const char* getTargetNodeName(unsigned Opcode) const override;
 
+  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+    return true;
+  }
   SDValue getRsqrtEstimate(SDValue Operand,
                            DAGCombinerInfo &DCI,
                            unsigned &RefinementSteps,

Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Thu Aug  4 07:47:28 2016
@@ -249,6 +249,19 @@ def FeatureSoftFloat
 def FeatureFastPartialYMMWrite
     : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
                        "true", "Partial writes to YMM registers are fast">;
+// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
+// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
+// vector FSQRT has higher throughput than the corresponding NR code.
+// The idea is that throughput bound code is likely to be vectorized, so for
+// vectorized code we should care about the throughput of SQRT operations.
+// But if the code is scalar that probably means that the code has some kind of
+// dependency and we should care more about reducing the latency.
+def FeatureFastScalarFSQRT
+    : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT",
+                       "true", "Scalar SQRT is fast (disable Newton-Raphson)">;
+def FeatureFastVectorFSQRT
+    : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT",
+                       "true", "Vector SQRT is fast (disable Newton-Raphson)">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -442,7 +455,8 @@ def SNBFeatures : ProcessorFeatures<[],
   FeaturePCLMUL,
   FeatureXSAVE,
   FeatureXSAVEOPT,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureFastScalarFSQRT
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -500,7 +514,8 @@ def SKLFeatures : ProcessorFeatures<BDWF
   FeatureXSAVEC,
   FeatureXSAVES,
   FeatureSGX,
-  FeatureCLFLUSHOPT
+  FeatureCLFLUSHOPT,
+  FeatureFastVectorFSQRT
 ]>;
 
 // FIXME: define SKL model

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Aug  4 07:47:28 2016
@@ -15081,6 +15081,19 @@ SDValue X86TargetLowering::ConvertCmpIfN
   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
 }
 
+/// Check if replacement of SQRT with RSQRT should be disabled.
+bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // We never want to use both SQRT and RSQRT instructions for the same input.
+  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
+    return false;
+
+  if (VT.isVector())
+    return Subtarget.hasFastVectorFSQRT();
+  return Subtarget.hasFastScalarFSQRT();
+}
+
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Thu Aug  4 07:47:28 2016
@@ -1219,6 +1219,9 @@ namespace llvm {
     /// Convert a comparison if required by the subtarget.
     SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
 
+    /// Check if replacement of SQRT with RSQRT should be disabled.
+    bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+
     /// Use rsqrt* to speed up sqrt calculations.
     SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps,

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Thu Aug  4 07:47:28 2016
@@ -282,6 +282,8 @@ void X86Subtarget::initializeEnvironment
   HasCmpxchg16b = false;
   UseLeaForSP = false;
   HasFastPartialYMMWrite = false;
+  HasFastScalarFSQRT = false;
+  HasFastVectorFSQRT = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=277725&r1=277724&r2=277725&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Thu Aug  4 07:47:28 2016
@@ -199,6 +199,14 @@ protected:
   /// of a YMM register without clearing the upper part.
   bool HasFastPartialYMMWrite;
 
+  /// True if hardware SQRTSS instruction is at least as fast (latency) as
+  /// RSQRTSS followed by a Newton-Raphson iteration.
+  bool HasFastScalarFSQRT;
+
+  /// True if hardware SQRTPS/VSQRTPS instructions are at least as fast
+  /// (throughput) as RSQRTPS/VRSQRTPS followed by a Newton-Raphson iteration.
+  bool HasFastVectorFSQRT;
+
   /// True if 8-bit divisions are significantly faster than
   /// 32-bit divisions and should be used when possible.
   bool HasSlowDivide32;
@@ -434,6 +442,8 @@ public:
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
   bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+  bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
+  bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }

Added: llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll?rev=277725&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath-tune.ll Thu Aug  4 07:47:28 2016
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=nehalem     | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=sandybridge | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=broadwell   | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mcpu=skylake     | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-ACC
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=+fast-scalar-fsqrt,-fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-ACC --check-prefix=VECTOR-EST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O2 -mattr=-fast-scalar-fsqrt,+fast-vector-fsqrt | FileCheck %s --check-prefix=SCALAR-EST --check-prefix=VECTOR-ACC
+
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
+
+define float @foo_x1(float %f) #0 {
+; SCALAR-EST-LABEL: foo_x1:
+; SCALAR-EST:       # BB#0:
+; SCALAR-EST-NEXT:    rsqrtss %xmm0
+; SCALAR-EST:         retq
+;
+; SCALAR-ACC-LABEL: foo_x1:
+; SCALAR-ACC:       # BB#0:
+; SCALAR-ACC-NEXT:    {{^ *v?sqrtss %xmm0}}
+; SCALAR-ACC-NEXT:    retq
+  %call = tail call float @llvm.sqrt.f32(float %f) #1
+  ret float %call
+}
+
+define <4 x float> @foo_x4(<4 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x4:
+; VECTOR-EST:       # BB#0:
+; VECTOR-EST-NEXT:    rsqrtps %xmm0
+; VECTOR-EST:         retq
+;
+; VECTOR-ACC-LABEL: foo_x4:
+; VECTOR-ACC:       # BB#0:
+; VECTOR-ACC-NEXT:    {{^ *v?sqrtps %xmm0}}
+; VECTOR-ACC-NEXT:    retq
+  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
+  ret <4 x float> %call
+}
+
+define <8 x float> @foo_x8(<8 x float> %f) #0 {
+; VECTOR-EST-LABEL: foo_x8:
+; VECTOR-EST:       # BB#0:
+; VECTOR-EST-NEXT:    rsqrtps
+; VECTOR-EST:         retq
+;
+; VECTOR-ACC-LABEL: foo_x8:
+; VECTOR-ACC:       # BB#0:
+; VECTOR-ACC-NEXT:    {{^ *v?sqrtps %[xy]mm0}}
+; VECTOR-ACC-NOT:     rsqrt
+; VECTOR-ACC:         retq
+  %call = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #1
+  ret <8 x float> %call
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }




More information about the llvm-commits mailing list