[llvm] r239001 - make reciprocal estimate code generation more flexible by adding command-line options (3rd try)

Fri Jun 5 14:34:37 PDT 2015

This breaks -DBUILD_SHARED_LIBS=ON for me in an llvm+clang build:

[10/92] Linking CXX shared library lib/libclangCodeGen.3.7.0svn.dylib
FAILED: : && /usr/bin/c++  -fPIC -fvisibility-inlines-hidden -Wall -W -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wcovered-switch-default -std=c++11 -fcolor-diagnostics -fno-common -Woverloaded-virtual -fno-strict-aliasing -Wno-nested-anon-types -g  -dynamiclib -Wl,-headerpad_max_install_names  -o lib/libclangCodeGen.3.7.0svn.dylib ...

Undefined symbols for architecture x86_64:
  "llvm::TargetRecip::TargetRecip()", referenced from:
      llvm::TargetOptions::TargetOptions() in BackendUtil.cpp.o
ld: symbol(s) not found for architecture x86_64
clang: error: linker command failed with exit code 1 (use -v to see invocation)

- Matthias

> On Jun 3, 2015, at 6:32 PM, Sanjay Patel <spatel at rotateright.com> wrote:
> 
> Author: spatel
> Date: Wed Jun  3 20:32:35 2015
> New Revision: 239001
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=239001&view=rev
> Log:
> make reciprocal estimate code generation more flexible by adding command-line options (3rd try)
> 
> The first try (r238051) to land this was reverted due to ExecutionEngine build failure;
> that was hopefully addressed by r238788.
> 
> The second try (r238842) to land this was reverted due to BUILD_SHARED_LIBS failure;
> that was hopefully addressed by r238953.
> 
> This patch adds a TargetRecip class for processing many recip codegen possibilities.
> The class is intended to handle both command-line options to llc as well
> as options passed in from a front-end such as clang with the -mrecip option.
> 
> The x86 backend is updated to use the new functionality.
> Only -mcpu=btver2 with -ffast-math should see a functional change from this patch.
> All other x86 CPUs continue to *not* use reciprocal estimates by default with -ffast-math.
> 
> Differential Revision: http://reviews.llvm.org/D8982
> 
> 
> Added:
>    llvm/trunk/include/llvm/Target/TargetRecip.h
>    llvm/trunk/lib/Target/TargetRecip.cpp
> Modified:
>    llvm/trunk/include/llvm/CodeGen/CommandFlags.h
>    llvm/trunk/include/llvm/Target/TargetOptions.h
>    llvm/trunk/lib/Target/CMakeLists.txt
>    llvm/trunk/lib/Target/X86/X86.td
>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>    llvm/trunk/lib/Target/X86/X86Subtarget.cpp
>    llvm/trunk/lib/Target/X86/X86Subtarget.h
>    llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
>    llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
>    llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
> 
> Modified: llvm/trunk/include/llvm/CodeGen/CommandFlags.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/CommandFlags.h?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/CommandFlags.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/CommandFlags.h Wed Jun  3 20:32:35 2015
> @@ -24,6 +24,7 @@
> #include "llvm/Support/Host.h"
> #include "llvm/Target/TargetMachine.h"
> #include "llvm/Target/TargetOptions.h"
> +#include "llvm/Target/TargetRecip.h"
> #include <string>
> using namespace llvm;
> 
> @@ -152,6 +153,12 @@ FuseFPOps("fp-contract",
>                          "Only fuse FP ops when the result won't be effected."),
>               clEnumValEnd));
> 
> +cl::list<std::string>
> +ReciprocalOps("recip",
> +  cl::CommaSeparated,
> +  cl::desc("Choose reciprocal operation types and parameters."),
> +  cl::value_desc("all,none,default,divf,!vec-sqrtd,vec-divd:0,sqrt:9..."));
> +
> cl::opt<bool>
> DontPlaceZerosInBSS("nozero-initialized-in-bss",
>               cl::desc("Don't place zero-initialized symbols into bss section"),
> @@ -230,6 +237,7 @@ static inline TargetOptions InitTargetOp
>   TargetOptions Options;
>   Options.LessPreciseFPMADOption = EnableFPMAD;
>   Options.AllowFPOpFusion = FuseFPOps;
> +  Options.Reciprocals = TargetRecip(ReciprocalOps);
>   Options.UnsafeFPMath = EnableUnsafeFPMath;
>   Options.NoInfsFPMath = EnableNoInfsFPMath;
>   Options.NoNaNsFPMath = EnableNoNaNsFPMath;
> 
> Modified: llvm/trunk/include/llvm/Target/TargetOptions.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetOptions.h?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetOptions.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetOptions.h Wed Jun  3 20:32:35 2015
> @@ -15,6 +15,7 @@
> #ifndef LLVM_TARGET_TARGETOPTIONS_H
> #define LLVM_TARGET_TARGETOPTIONS_H
> 
> +#include "llvm/Target/TargetRecip.h"
> #include "llvm/MC/MCTargetOptions.h"
> #include <string>
> 
> @@ -72,7 +73,8 @@ namespace llvm {
>           CompressDebugSections(false), FunctionSections(false),
>           DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
>           TrapFuncName(), FloatABIType(FloatABI::Default),
> -          AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
> +          AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(TargetRecip()),
> +          JTType(JumpTable::Single),
>           ThreadModel(ThreadModel::POSIX) {}
> 
>     /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
> @@ -206,6 +208,9 @@ namespace llvm {
>     /// the value of this option.
>     FPOpFusion::FPOpFusionMode AllowFPOpFusion;
> 
> +    /// This class encapsulates options for reciprocal-estimate code generation.
> +    TargetRecip Reciprocals;
> +    
>     /// JTType - This flag specifies the type of jump-instruction table to
>     /// create for functions that have the jumptable attribute.
>     JumpTable::JumpTableType JTType;
> @@ -240,6 +245,7 @@ inline bool operator==(const TargetOptio
>     ARE_EQUAL(TrapFuncName) &&
>     ARE_EQUAL(FloatABIType) &&
>     ARE_EQUAL(AllowFPOpFusion) &&
> +    ARE_EQUAL(Reciprocals) &&
>     ARE_EQUAL(JTType) &&
>     ARE_EQUAL(ThreadModel) &&
>     ARE_EQUAL(MCOptions);
> 
> Added: llvm/trunk/include/llvm/Target/TargetRecip.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetRecip.h?rev=239001&view=auto
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetRecip.h (added)
> +++ llvm/trunk/include/llvm/Target/TargetRecip.h Wed Jun  3 20:32:35 2015
> @@ -0,0 +1,73 @@
> +//===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This class is used to customize machine-specific reciprocal estimate code
> +// generation in a target-independent way.
> +// If a target does not support operations in this specification, then code
> +// generation will default to using supported operations.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#ifndef LLVM_TARGET_TARGETRECIP_H
> +#define LLVM_TARGET_TARGETRECIP_H
> +
> +#include "llvm/ADT/StringRef.h"
> +#include <vector>
> +#include <string>
> +#include <map>
> +
> +namespace llvm {
> +
> +struct TargetRecip {
> +public:
> +  TargetRecip();
> +
> +  /// Initialize all or part of the operations from command-line options or
> +  /// a front end.
> +  TargetRecip(const std::vector<std::string> &Args);
> +  
> +  /// Set whether a particular reciprocal operation is enabled and how many
> +  /// refinement steps are needed when using it. Use "all" to set enablement
> +  /// and refinement steps for all operations.
> +  void setDefaults(const StringRef &Key, bool Enable, unsigned RefSteps);
> +
> +  /// Return true if the reciprocal operation has been enabled by default or
> +  /// from the command-line. Return false if the operation has been disabled
> +  /// by default or from the command-line.
> +  bool isEnabled(const StringRef &Key) const;
> +
> +  /// Return the number of iterations necessary to refine the
> +  /// result of a machine instruction for the given reciprocal operation.
> +  unsigned getRefinementSteps(const StringRef &Key) const;
> +
> +  bool operator==(const TargetRecip &Other) const;
> +
> +private:
> +  enum {
> +    Uninitialized = -1
> +  };
> +  
> +  struct RecipParams {
> +    int8_t Enabled;
> +    int8_t RefinementSteps;
> +    
> +    RecipParams() : Enabled(Uninitialized), RefinementSteps(Uninitialized) {}
> +  };
> +  
> +  std::map<StringRef, RecipParams> RecipMap;
> +  typedef std::map<StringRef, RecipParams>::iterator RecipIter;
> +  typedef std::map<StringRef, RecipParams>::const_iterator ConstRecipIter;
> +
> +  bool parseGlobalParams(const std::string &Arg);
> +  void parseIndividualParams(const std::vector<std::string> &Args);
> +};
> +
> +} // End llvm namespace
> +
> +#endif
> 
> Modified: llvm/trunk/lib/Target/CMakeLists.txt
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/CMakeLists.txt?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/CMakeLists.txt (original)
> +++ llvm/trunk/lib/Target/CMakeLists.txt Wed Jun  3 20:32:35 2015
> @@ -6,6 +6,7 @@ add_llvm_library(LLVMTarget
>   TargetLoweringObjectFile.cpp
>   TargetMachine.cpp
>   TargetMachineC.cpp
> +  TargetRecip.cpp
>   TargetSubtargetInfo.cpp
> 
>   ADDITIONAL_HEADER_DIRS
> 
> Added: llvm/trunk/lib/Target/TargetRecip.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/TargetRecip.cpp?rev=239001&view=auto
> ==============================================================================
> --- llvm/trunk/lib/Target/TargetRecip.cpp (added)
> +++ llvm/trunk/lib/Target/TargetRecip.cpp Wed Jun  3 20:32:35 2015
> @@ -0,0 +1,225 @@
> +//===-------------------------- TargetRecip.cpp ---------------------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This class is used to customize machine-specific reciprocal estimate code
> +// generation in a target-independent way.
> +// If a target does not support operations in this specification, then code
> +// generation will default to using supported operations.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#include "llvm/ADT/StringRef.h"
> +#include "llvm/ADT/STLExtras.h"
> +#include "llvm/Support/ErrorHandling.h"
> +#include "llvm/Target/TargetRecip.h"
> +#include <map>
> +
> +using namespace llvm;
> +
> +// These are the names of the individual reciprocal operations. These are
> +// the key strings for queries and command-line inputs.
> +// In addition, the command-line interface recognizes the global parameters
> +// "all", "none", and "default".
> +static const char *RecipOps[] = {
> +  "divd",
> +  "divf",
> +  "vec-divd",
> +  "vec-divf",
> +  "sqrtd",
> +  "sqrtf",
> +  "vec-sqrtd",
> +  "vec-sqrtf",
> +};
> +
> +// The uninitialized state is needed for the enabled settings and refinement
> +// steps because custom settings may arrive via the command-line before target
> +// defaults are set.
> +TargetRecip::TargetRecip() {
> +  unsigned NumStrings = llvm::array_lengthof(RecipOps);
> +  for (unsigned i = 0; i < NumStrings; ++i)
> +    RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
> +}
> +
> +static bool parseRefinementStep(const StringRef &In, size_t &Position,
> +                                uint8_t &Value) {
> +  const char RefStepToken = ':';
> +  Position = In.find(RefStepToken);
> +  if (Position == StringRef::npos)
> +    return false;
> +
> +  StringRef RefStepString = In.substr(Position + 1);
> +  // Allow exactly one numeric character for the additional refinement
> +  // step parameter.
> +  if (RefStepString.size() == 1) {
> +    char RefStepChar = RefStepString[0];
> +    if (RefStepChar >= '0' && RefStepChar <= '9') {
> +      Value = RefStepChar - '0';
> +      return true;
> +    }
> +  }
> +  report_fatal_error("Invalid refinement step for -recip.");
> +}
> +
> +bool TargetRecip::parseGlobalParams(const std::string &Arg) {
> +  StringRef ArgSub = Arg;
> +
> +  // Look for an optional setting of the number of refinement steps needed
> +  // for this type of reciprocal operation.
> +  size_t RefPos;
> +  uint8_t RefSteps;
> +  StringRef RefStepString;
> +  if (parseRefinementStep(ArgSub, RefPos, RefSteps)) {
> +    // Split the string for further processing.
> +    RefStepString = ArgSub.substr(RefPos + 1);
> +    ArgSub = ArgSub.substr(0, RefPos);
> +  }
> +  bool Enable;
> +  bool UseDefaults;
> +  if (ArgSub == "all") {
> +    UseDefaults = false;
> +    Enable = true;
> +  } else if (ArgSub == "none") {
> +    UseDefaults = false;
> +    Enable = false;
> +  } else if (ArgSub == "default") {
> +    UseDefaults = true;
> +  } else {
> +    // Any other string is invalid or an individual setting.
> +    return false;
> +  }
> +
> +  // All enable values will be initialized to target defaults if 'default' was
> +  // specified.
> +  if (!UseDefaults)
> +    for (auto &KV : RecipMap)
> +      KV.second.Enabled = Enable;
> +
> +  // Custom refinement count was specified with all, none, or default.
> +  if (!RefStepString.empty())
> +    for (auto &KV : RecipMap)
> +      KV.second.RefinementSteps = RefSteps;
> +  
> +  return true;
> +}
> +
> +void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
> +  static const char DisabledPrefix = '!';
> +  unsigned NumArgs = Args.size();
> +
> +  for (unsigned i = 0; i != NumArgs; ++i) {
> +    StringRef Val = Args[i];
> +    
> +    bool IsDisabled = Val[0] == DisabledPrefix;
> +    // Ignore the disablement token for string matching.
> +    if (IsDisabled)
> +      Val = Val.substr(1);
> +    
> +    size_t RefPos;
> +    uint8_t RefSteps;
> +    StringRef RefStepString;
> +    if (parseRefinementStep(Val, RefPos, RefSteps)) {
> +      // Split the string for further processing.
> +      RefStepString = Val.substr(RefPos + 1);
> +      Val = Val.substr(0, RefPos);
> +    }
> +
> +    RecipIter Iter = RecipMap.find(Val);
> +    if (Iter == RecipMap.end()) {
> +      // Try again specifying float suffix.
> +      Iter = RecipMap.find(Val.str() + 'f');
> +      if (Iter == RecipMap.end()) {
> +        Iter = RecipMap.find(Val.str() + 'd');
> +        assert(Iter == RecipMap.end() && "Float entry missing from map");
> +        report_fatal_error("Invalid option for -recip.");
> +      }
> +      
> +      // The option was specified without a float or double suffix.
> +      if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
> +        // Make sure that the double entry was not already specified.
> +        // The float entry will be checked below.
> +        report_fatal_error("Duplicate option for -recip.");
> +      }
> +    }
> +    
> +    if (Iter->second.Enabled != Uninitialized)
> +      report_fatal_error("Duplicate option for -recip.");
> +    
> +    // Mark the matched option as found. Do not allow duplicate specifiers.
> +    Iter->second.Enabled = !IsDisabled;
> +    if (!RefStepString.empty())
> +      Iter->second.RefinementSteps = RefSteps;
> +    
> +    // If the precision was not specified, the double entry is also initialized.
> +    if (Val.back() != 'f' && Val.back() != 'd') {
> +      RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
> +      if (!RefStepString.empty())
> +        RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
> +    }
> +  }
> +}
> +
> +TargetRecip::TargetRecip(const std::vector<std::string> &Args) :
> +  TargetRecip() {
> +  unsigned NumArgs = Args.size();
> +
> +  // Check if "all", "default", or "none" was specified.
> +  if (NumArgs == 1 && parseGlobalParams(Args[0]))
> +    return;
> + 
> +  parseIndividualParams(Args);
> +}
> +
> +bool TargetRecip::isEnabled(const StringRef &Key) const {
> +  ConstRecipIter Iter = RecipMap.find(Key);
> +  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
> +  assert(Iter->second.Enabled != Uninitialized &&
> +         "Enablement setting was not initialized");
> +  return Iter->second.Enabled;
> +}
> +
> +unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
> +  ConstRecipIter Iter = RecipMap.find(Key);
> +  assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
> +  assert(Iter->second.RefinementSteps != Uninitialized &&
> +         "Refinement step setting was not initialized");
> +  return Iter->second.RefinementSteps;
> +}
> +
> +/// Custom settings (previously initialized values) override target defaults.
> +void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
> +                              unsigned RefSteps) {
> +  if (Key == "all") {
> +    for (auto &KV : RecipMap) {
> +      RecipParams &RP = KV.second;
> +      if (RP.Enabled == Uninitialized)
> +        RP.Enabled = Enable;
> +      if (RP.RefinementSteps == Uninitialized)
> +        RP.RefinementSteps = RefSteps;
> +    }
> +  } else {
> +    RecipParams &RP = RecipMap[Key];
> +    if (RP.Enabled == Uninitialized)
> +      RP.Enabled = Enable;
> +    if (RP.RefinementSteps == Uninitialized)
> +      RP.RefinementSteps = RefSteps;
> +  }
> +}
> +
> +bool TargetRecip::operator==(const TargetRecip &Other) const {
> +  for (const auto &KV : RecipMap) {
> +    const StringRef &Op = KV.first;
> +    const RecipParams &RP = KV.second;
> +    const RecipParams &OtherRP = Other.RecipMap.find(Op)->second;
> +    if (RP.RefinementSteps != OtherRP.RefinementSteps)
> +      return false;
> +    if (RP.Enabled != OtherRP.Enabled)
> +      return false;
> +  }
> +  return true;
> +}
> 
> Modified: llvm/trunk/lib/Target/X86/X86.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86.td (original)
> +++ llvm/trunk/lib/Target/X86/X86.td Wed Jun  3 20:32:35 2015
> @@ -190,10 +190,6 @@ def FeatureSlowLEA : SubtargetFeature<"s
>                                    "LEA instruction with certain arguments is slow">;
> def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
>                                    "INC and DEC instructions are slower than ADD and SUB">;
> -def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
> -                            "Use RSQRT* to optimize square root calculations">;
> -def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
> -                          "true", "Use RCP* to optimize division calculations">;
> def FeatureSoftFloat
>     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
>                        "Use software floating point features.">;
> @@ -446,7 +442,7 @@ def : ProcessorModel<"btver2", BtVer2Mod
>                       FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
>                       FeatureBMI, FeatureF16C, FeatureMOVBE,
>                       FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
> -                      FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
> +                      FeatureSlowSHLD]>;
> 
> // TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
> 
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jun  3 20:32:35 2015
> @@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorW
>              "rather than promotion."),
>     cl::Hidden);
> 
> -static cl::opt<int> ReciprocalEstimateRefinementSteps(
> -    "x86-recip-refinement-steps", cl::init(1),
> -    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
> -             "result of the hardware reciprocal estimate instruction."),
> -    cl::NotHidden);
> -
> // Forward declarations.
> static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
>                        SDValue V2);
> @@ -13006,29 +13000,31 @@ SDValue X86TargetLowering::getRsqrtEstim
>                                             DAGCombinerInfo &DCI,
>                                             unsigned &RefinementSteps,
>                                             bool &UseOneConstNR) const {
> -  // FIXME: We should use instruction latency models to calculate the cost of
> -  // each potential sequence, but this is very hard to do reliably because
> -  // at least Intel's Core* chips have variable timing based on the number of
> -  // significant digits in the divisor and/or sqrt operand.
> -  if (!Subtarget->useSqrtEst())
> -    return SDValue();
> -
>   EVT VT = Op.getValueType();
> +  const char *RecipOp;
> 
> -  // SSE1 has rsqrtss and rsqrtps.
> +  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
>   // TODO: Add support for AVX512 (v16f32).
>   // It is likely not profitable to do this for f64 because a double-precision
>   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
>   // instructions: convert to single, rsqrtss, convert back to double, refine
>   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
>   // along with FMA, this could be a throughput win.
> -  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
> -      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
> -    RefinementSteps = 1;
> -    UseOneConstNR = false;
> -    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
> -  }
> -  return SDValue();
> +  if (VT == MVT::f32 && Subtarget->hasSSE1())
> +    RecipOp = "sqrtf";
> +  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
> +           (VT == MVT::v8f32 && Subtarget->hasAVX()))
> +    RecipOp = "vec-sqrtf";
> +  else
> +    return SDValue();
> +  
> +  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
> +  if (!Recips.isEnabled(RecipOp))
> +    return SDValue();
> +  
> +  RefinementSteps = Recips.getRefinementSteps(RecipOp);
> +  UseOneConstNR = false;
> +  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
> }
> 
> /// The minimum architected relative accuracy is 2^-12. We need one
> @@ -13036,15 +13032,9 @@ SDValue X86TargetLowering::getRsqrtEstim
> SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
>                                             DAGCombinerInfo &DCI,
>                                             unsigned &RefinementSteps) const {
> -  // FIXME: We should use instruction latency models to calculate the cost of
> -  // each potential sequence, but this is very hard to do reliably because
> -  // at least Intel's Core* chips have variable timing based on the number of
> -  // significant digits in the divisor.
> -  if (!Subtarget->useReciprocalEst())
> -    return SDValue();
> -
>   EVT VT = Op.getValueType();
> -
> +  const char *RecipOp;
> +  
>   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
>   // TODO: Add support for AVX512 (v16f32).
>   // It is likely not profitable to do this for f64 because a double-precision
> @@ -13052,12 +13042,20 @@ SDValue X86TargetLowering::getRecipEstim
>   // 15 instructions: convert to single, rcpss, convert back to double, refine
>   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
>   // along with FMA, this could be a throughput win.
> -  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
> -      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
> -    RefinementSteps = ReciprocalEstimateRefinementSteps;
> -    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
> -  }
> -  return SDValue();
> +  if (VT == MVT::f32 && Subtarget->hasSSE1())
> +    RecipOp = "divf";
> +  else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
> +           (VT == MVT::v8f32 && Subtarget->hasAVX()))
> +    RecipOp = "vec-divf";
> +  else
> +    return SDValue();
> +  
> +  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
> +  if (!Recips.isEnabled(RecipOp))
> +    return SDValue();
> +
> +  RefinementSteps = Recips.getRefinementSteps(RecipOp);
> +  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
> }
> 
> /// If we have at least two divisions that use the same divisor, convert to
> 
> Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Wed Jun  3 20:32:35 2015
> @@ -274,8 +274,6 @@ void X86Subtarget::initializeEnvironment
>   LEAUsesAG = false;
>   SlowLEA = false;
>   SlowIncDec = false;
> -  UseSqrtEst = false;
> -  UseReciprocalEst = false;
>   stackAlignment = 4;
>   // FIXME: this is a known good value for Yonah. How about others?
>   MaxInlineSizeThreshold = 128;
> 
> Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
> +++ llvm/trunk/lib/Target/X86/X86Subtarget.h Wed Jun  3 20:32:35 2015
> @@ -190,16 +190,6 @@ protected:
>   /// True if INC and DEC instructions are slow when writing to flags
>   bool SlowIncDec;
> 
> -  /// Use the RSQRT* instructions to optimize square root calculations.
> -  /// For this to be profitable, the cost of FSQRT and FDIV must be
> -  /// substantially higher than normal FP ops like FADD and FMUL.
> -  bool UseSqrtEst;
> -
> -  /// Use the RCP* instructions to optimize FP division calculations.
> -  /// For this to be profitable, the cost of FDIV must be
> -  /// substantially higher than normal FP ops like FADD and FMUL.
> -  bool UseReciprocalEst;
> -
>   /// Processor has AVX-512 PreFetch Instructions
>   bool HasPFI;
> 
> @@ -380,8 +370,6 @@ public:
>   bool LEAusesAG() const { return LEAUsesAG; }
>   bool slowLEA() const { return SlowLEA; }
>   bool slowIncDec() const { return SlowIncDec; }
> -  bool useSqrtEst() const { return UseSqrtEst; }
> -  bool useReciprocalEst() const { return UseReciprocalEst; }
>   bool hasCDI() const { return HasCDI; }
>   bool hasPFI() const { return HasPFI; }
>   bool hasERI() const { return HasERI; }
> 
> Modified: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetMachine.cpp?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp Wed Jun  3 20:32:35 2015
> @@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const
>   if (Subtarget.isTargetWin64())
>     this->Options.TrapUnreachable = true;
> 
> +  // TODO: By default, all reciprocal estimate operations are off because
> +  // that matches the behavior before TargetRecip was added (except for btver2
> +  // which used subtarget features to enable this type of codegen).
> +  // We should change this to match GCC behavior where everything but
> +  // scalar division estimates are turned on by default with -ffast-math.
> +  this->Options.Reciprocals.setDefaults("all", false, 1);
> +
>   initAsmInfo();
> }
> 
> 
> Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Wed Jun  3 20:32:35 2015
> @@ -1,6 +1,6 @@
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
> -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
> -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
> +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
> +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
> 
> ; If the target's divss/divps instructions are substantially
> ; slower than rcpss/rcpps with a Newton-Raphson refinement,
> 
> Modified: llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll?rev=239001&r1=239000&r2=239001&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/sqrt-fastmath.ll Wed Jun  3 20:32:35 2015
> @@ -1,5 +1,5 @@
> ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
> -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
> +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
> 
> declare double @__sqrt_finite(double) #0
> declare float @__sqrtf_finite(float) #0
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits