[llvm] r221706 - Use rcpss/rcpps (X86) to speed up reciprocal calcs (PR21385).

Tue Nov 11 12:51:00 PST 2014

Author: spatel
Date: Tue Nov 11 14:51:00 2014
New Revision: 221706

URL: http://llvm.org/viewvc/llvm-project?rev=221706&view=rev
Log:
Use rcpss/rcpps (X86) to speed up reciprocal calcs (PR21385).

This is a first step for generating SSE rcp instructions for reciprocal
calcs when fast-math allows it. This is very similar to the rsqrt optimization
enabled in D5658 ( http://reviews.llvm.org/rL220570 ).

For now, be conservative and only enable this for AMD btver2 where performance
improves significantly both in terms of latency and throughput.

We may never enable this codegen for Intel Core* chips because the divider circuits
are just too fast. On SandyBridge, divss can be as fast as 10 cycles versus the 21
cycle critical path for the rcp + mul + sub + mul + add estimate.

Follow-on patches may allow configuration of the number of Newton-Raphson refinement
steps, add AVX512 support, and enable the optimization for more chips.

More background here: http://llvm.org/bugs/show_bug.cgi?id=21385

Differential Revision: http://reviews.llvm.org/D6175


Added:
    llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
Modified:
    llvm/trunk/lib/Target/X86/X86.td
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86Subtarget.h

Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=221706&r1=221705&r2=221706&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Tue Nov 11 14:51:00 2014
@@ -184,6 +184,8 @@ def FeatureSlowIncDec : SubtargetFeature
                                    "INC and DEC instructions are slower than ADD and SUB">;
 def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
                             "Use RSQRT* to optimize square root calculations">;
+def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
+                          "true", "Use RCP* to optimize division calculations">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -350,7 +352,7 @@ def : ProcessorModel<"btver2", BtVer2Mod
                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                       FeatureBMI, FeatureF16C, FeatureMOVBE,
                       FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
-                      FeatureUseSqrtEst]>;
+                      FeatureUseSqrtEst, FeatureUseRecipEst]>;
 
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=221706&r1=221705&r2=221706&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Nov 11 14:51:00 2014
@@ -14514,6 +14514,37 @@ SDValue X86TargetLowering::getRsqrtEstim
   return SDValue();
 }
 
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor.
+  if (!Subtarget->useReciprocalEst())
+    return SDValue();
+  
+  EVT VT = Op.getValueType();
+  
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    // TODO: Expose this as a user-configurable parameter to allow for
+    // speed vs. accuracy flexibility.
+    RefinementSteps = 1;
+    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=221706&r1=221705&r2=221706&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Tue Nov 11 14:51:00 2014
@@ -1031,6 +1031,10 @@ namespace llvm {
     SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
                              unsigned &RefinementSteps,
                              bool &UseOneConstNR) const override;
+
+    /// Use rcp* to speed up fdiv calculations.
+    SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps) const override;
   };
 
   namespace X86 {

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=221706&r1=221705&r2=221706&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Tue Nov 11 14:51:00 2014
@@ -197,6 +197,11 @@ protected:
   /// substantially higher than normal FP ops like FADD and FMUL.
   bool UseSqrtEst;
 
+  /// Use the RCP* instructions to optimize FP division calculations.
+  /// For this to be profitable, the cost of FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseReciprocalEst;
+  
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
 
@@ -375,6 +380,7 @@ public:
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool useSqrtEst() const { return UseSqrtEst; }
+  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }

Added: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=221706&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (added)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Tue Nov 11 14:51:00 2014
@@ -0,0 +1,72 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vsubss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vsubps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }