[llvm] r178617 - Use PPC reciprocal estimates with Newton iteration in fast-math mode

Wed Apr 3 10:47:02 PDT 2013

----- Original Message -----
> From: "Bill Schmidt" <wschmidt at linux.vnet.ibm.com>
> To: "Hal Finkel" <hfinkel at anl.gov>
> Cc: llvm-commits at cs.uiuc.edu
> Sent: Wednesday, April 3, 2013 9:34:42 AM
> Subject: Re: [llvm] r178617 - Use PPC reciprocal estimates with Newton iteration in	fast-math mode
> 
> Hi Hal,
> 
> Thanks for tackling this!  A few small notes below.
> 
> On Wed, 2013-04-03 at 04:01 +0000, Hal Finkel wrote:
> > Author: hfinkel
> > Date: Tue Apr  2 23:01:11 2013
> > New Revision: 178617
> > 
> > URL: http://llvm.org/viewvc/llvm-project?rev=178617&view=rev
> > Log:
> > Use PPC reciprocal estimates with Newton iteration in fast-math
> > mode
> > 
> > When unsafe FP math operations are enabled, we can use the fre[s]
> > and
> > frsqrte[s] instructions, which generate reciprocal (sqrt)
> > estimates, together
> > with some Newton iteration, in order to quickly generate
> > floating-point
> > division and sqrt results. All of these instructions are separately
> > optional,
> > and so each has its own feature flag (except for the Altivec
> > instructions,
> > which are covered under the existing Altivec flag). Doing this is
> > not only
> > faster than using the IEEE-compliant fdiv/fsqrt instructions, but
> > allows these
> > computations to be pipelined with other computations in order to
> > hide their
> > overall latency.
> > 
> > I've also added a couple of missing fnmsub patterns which turned
> > out to be
> > missing (but are necessary for good code generation of the Newton
> > iterations).
> > Altivec needs a similar fix, but that will probably be more
> > complicated because
> > fneg is expanded for Altivec's v4f32.
> 
> Could you please add a FIXME comment in the code for this?  I'm
> afraid
> it will get lost otherwise.

r178658.

> 
> > 
> > Added:
> >     llvm/trunk/test/CodeGen/PowerPC/recipest.ll
> > Modified:
> >     llvm/trunk/lib/Target/PowerPC/PPC.td
> >     llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
> >     llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
> >     llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
> >     llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td
> >     llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp
> >     llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPC.td
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPC.td?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPC.td (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPC.td Tue Apr  2 23:01:11 2013
> > @@ -57,6 +57,16 @@ def FeatureMFOCRF    : SubtargetFeature<
> >                                          "Enable the MFOCRF
> >                                          instruction">;
> >  def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT",
> >  "true",
> >                                          "Enable the fsqrt
> >                                          instruction">;
> > +def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
> > +                                        "Enable the fre
> > instruction">;
> > +def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
> > +                                        "Enable the fres
> > instruction">;
> > +def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE",
> > "true",
> > +                                        "Enable the frsqrte
> > instruction">;
> > +def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES",
> > "true",
> > +                                        "Enable the frsqrtes
> > instruction">;
> > +def FeatureRecipPrec : SubtargetFeature<"recipprec",
> > "HasRecipPrec", "true",
> > +                              "Assume higher precision reciprocal
> > estimates">;
> >  def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX",
> >  "true",
> >                                          "Enable the stfiwx
> >                                          instruction">;
> >  def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX",
> >  "true",
> > @@ -101,30 +111,46 @@ include "PPCInstrInfo.td"
> > 
> >  def : Processor<"generic", G3Itineraries, [Directive32]>;
> >  def : Processor<"440", PPC440Itineraries, [Directive440,
> >  FeatureISEL,
> > +                                           FeatureFRES,
> > FeatureFRSQRTE,
> >                                             FeatureBookE]>;
> >  def : Processor<"450", PPC440Itineraries, [Directive440,
> >  FeatureISEL,
> > +                                           FeatureFRES,
> > FeatureFRSQRTE,
> >                                             FeatureBookE]>;
> >  def : Processor<"601", G3Itineraries, [Directive601]>;
> >  def : Processor<"602", G3Itineraries, [Directive602]>;
> > -def : Processor<"603", G3Itineraries, [Directive603]>;
> > -def : Processor<"603e", G3Itineraries, [Directive603]>;
> > -def : Processor<"603ev", G3Itineraries, [Directive603]>;
> > -def : Processor<"604", G3Itineraries, [Directive604]>;
> > -def : Processor<"604e", G3Itineraries, [Directive604]>;
> > -def : Processor<"620", G3Itineraries, [Directive620]>;
> > -def : Processor<"750", G4Itineraries, [Directive750]>;
> > -def : Processor<"g3", G3Itineraries, [Directive750]>;
> > -def : Processor<"7400", G4Itineraries, [Directive7400,
> > FeatureAltivec]>;
> > -def : Processor<"g4", G4Itineraries, [Directive7400,
> > FeatureAltivec]>;
> > -def : Processor<"7450", G4PlusItineraries, [Directive7400,
> > FeatureAltivec]>;
> > -def : Processor<"g4+", G4PlusItineraries, [Directive7400,
> > FeatureAltivec]>;
> > +def : Processor<"603", G3Itineraries, [Directive603,
> > +                                       FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"603e", G3Itineraries, [Directive603,
> > +                                        FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"603ev", G3Itineraries, [Directive603,
> > +                                         FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"604", G3Itineraries, [Directive604,
> > +                                       FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"604e", G3Itineraries, [Directive604,
> > +                                        FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"620", G3Itineraries, [Directive620,
> > +                                       FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"750", G4Itineraries, [Directive750,
> > +                                       FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"g3", G3Itineraries, [Directive750,
> > +                                      FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"7400", G4Itineraries, [Directive7400,
> > FeatureAltivec,
> > +                                        FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"g4", G4Itineraries, [Directive7400,
> > FeatureAltivec,
> > +                                      FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"7450", G4PlusItineraries, [Directive7400,
> > FeatureAltivec,
> > +                                            FeatureFRES,
> > FeatureFRSQRTE]>;
> > +def : Processor<"g4+", G4PlusItineraries, [Directive7400,
> > FeatureAltivec,
> > +                                           FeatureFRES,
> > FeatureFRSQRTE]>;
> >  def : Processor<"970", G5Itineraries,
> >                    [Directive970, FeatureAltivec,
> > -                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
> > +                   FeatureMFOCRF, FeatureFSqrt,
> > +                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
> >                     Feature64Bit /*, Feature64BitRegs */]>;
> >  def : Processor<"g5", G5Itineraries,
> >                    [Directive970, FeatureAltivec,
> >                     FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
> > +                   FeatureFRES, FeatureFRSQRTE,
> >                     Feature64Bit /*, Feature64BitRegs */]>;
> >  def : ProcessorModel<"e500mc", PPCE500mcModel,
> >                    [DirectiveE500mc, FeatureMFOCRF,
> > @@ -134,43 +160,57 @@ def : ProcessorModel<"e5500", PPCE5500Mo
> >                     FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
> >  def : Processor<"a2", PPCA2Itineraries,
> >                    [DirectiveA2, FeatureBookE, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
> > +                   FeatureFSqrt, FeatureFRE, FeatureFRES,
> > +                   FeatureFRSQRTE, FeatureFRSQRTES,
> > FeatureRecipPrec,
> > +                   FeatureSTFIWX, FeatureLFIWAX,
> >                     FeatureFPRND, FeatureFPCVT, FeatureISEL,
> >                     FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
> >                 /*, Feature64BitRegs */]>;
> >  def : Processor<"a2q", PPCA2Itineraries,
> >                    [DirectiveA2, FeatureBookE, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
> > +                   FeatureFSqrt, FeatureFRE, FeatureFRES,
> > +                   FeatureFRSQRTE, FeatureFRSQRTES,
> > FeatureRecipPrec,
> > +                   FeatureSTFIWX, FeatureLFIWAX,
> >                     FeatureFPRND, FeatureFPCVT, FeatureISEL,
> >                     FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
> >                 /*, Feature64BitRegs */, FeatureQPX]>;
> >  def : Processor<"pwr3", G5Itineraries,
> > -                  [DirectivePwr3, FeatureAltivec, FeatureMFOCRF,
> > +                  [DirectivePwr3, FeatureAltivec,
> > +                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
> >                     FeatureSTFIWX, Feature64Bit]>;
> >  def : Processor<"pwr4", G5Itineraries,
> >                    [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
> > +                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
> > +                   FeatureSTFIWX, Feature64Bit]>;
> >  def : Processor<"pwr5", G5Itineraries,
> >                    [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
> > +                   FeatureFSqrt, FeatureFRE, FeatureFRES,
> > +                   FeatureFRSQRTE, FeatureFRSQRTES,
> > +                   FeatureSTFIWX, Feature64Bit]>;
> >  def : Processor<"pwr5x", G5Itineraries,
> >                    [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, FeatureFPRND,
> > -                   Feature64Bit]>;
> > +                   FeatureFSqrt, FeatureFRE, FeatureFRES,
> > +                   FeatureFRSQRTE, FeatureFRSQRTES,
> > +                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
> >  def : Processor<"pwr6", G5Itineraries,
> >                    [DirectivePwr6, FeatureAltivec,
> > -                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
> > -                   FeatureLFIWAX, FeatureFPRND, Feature64Bit
> > -               /*, Feature64BitRegs */]>;
> > +                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
> > +                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
> > +                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
> > +                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs
> > */]>;
> >  def : Processor<"pwr6x", G5Itineraries,
> >                    [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
> > -                   FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
> > +                   FeatureFSqrt, FeatureFRE, FeatureFRES,
> > +                   FeatureFRSQRTE, FeatureFRSQRTES,
> > FeatureRecipPrec,
> > +                   FeatureSTFIWX, FeatureLFIWAX,
> >                     FeatureFPRND, Feature64Bit]>;
> >  def : Processor<"pwr7", G5Itineraries,
> >                    [DirectivePwr7, FeatureAltivec,
> > -                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
> > -                   FeatureLFIWAX, FeatureFPRND, FeatureFPCVT,
> > -                   FeatureISEL, FeaturePOPCNTD, FeatureLDBRX,
> > +                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
> > +                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
> > +                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
> > +                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
> > +                   FeaturePOPCNTD, FeatureLDBRX,
> >                     Feature64Bit /*, Feature64BitRegs */]>;
> >  def : Processor<"ppc", G3Itineraries, [Directive32]>;
> >  def : Processor<"ppc64", G5Itineraries,
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Tue Apr  2
> > 23:01:11 2013
> > @@ -150,10 +150,15 @@ PPCTargetLowering::PPCTargetLowering(PPC
> >    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
> > 
> >    // If we're enabling GP optimizations, use hardware square root
> > -  if (!Subtarget->hasFSQRT()) {
> > +  if (!Subtarget->hasFSQRT() &&
> > +      !(TM.Options.UnsafeFPMath &&
> > +        Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
> >      setOperationAction(ISD::FSQRT, MVT::f64, Expand);
> > +
> > +  if (!Subtarget->hasFSQRT() &&
> > +      !(TM.Options.UnsafeFPMath &&
> > +        Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
> >      setOperationAction(ISD::FSQRT, MVT::f32, Expand);
> > -  }
> > 
> >    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
> >    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
> > @@ -469,6 +474,12 @@ PPCTargetLowering::PPCTargetLowering(PPC
> > 
> >      setOperationAction(ISD::MUL, MVT::v4f32, Legal);
> >      setOperationAction(ISD::FMA, MVT::v4f32, Legal);
> > +
> > +    if (TM.Options.UnsafeFPMath) {
> > +      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
> > +      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
> > +    }
> > +
> >      setOperationAction(ISD::MUL, MVT::v4i32, Custom);
> >      setOperationAction(ISD::MUL, MVT::v8i16, Custom);
> >      setOperationAction(ISD::MUL, MVT::v16i8, Custom);
> > @@ -519,6 +530,12 @@ PPCTargetLowering::PPCTargetLowering(PPC
> >    setTargetDAGCombine(ISD::BR_CC);
> >    setTargetDAGCombine(ISD::BSWAP);
> > 
> > +  // Use reciprocal estimates.
> > +  if (TM.Options.UnsafeFPMath) {
> > +    setTargetDAGCombine(ISD::FDIV);
> > +    setTargetDAGCombine(ISD::FSQRT);
> > +  }
> > +
> >    // Darwin long double math library functions have $LDBL128
> >    appended.
> >    if (Subtarget->isDarwin()) {
> >      setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
> > @@ -590,6 +607,8 @@ const char *PPCTargetLowering::getTarget
> >    case PPCISD::FCFID:           return "PPCISD::FCFID";
> >    case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
> >    case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
> > +  case PPCISD::FRE:             return "PPCISD::FRE";
> > +  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
> >    case PPCISD::STFIWX:          return "PPCISD::STFIWX";
> >    case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
> >    case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
> > @@ -6658,6 +6677,153 @@ PPCTargetLowering::EmitInstrWithCustomIn
> >  // Target Optimization Hooks
> >  //===----------------------------------------------------------------------===//
> > 
> > +SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N,
> > +                                             DAGCombinerInfo &DCI,
> > +                                             bool UseOperand)
> > const {
> 
> Please document UseOperand in header commentary for the function.
>  It's
> not rocket science, but will help readability.

I changed the functions to always compute using node provided as the input value, so this parameter is no longer necessary.

> 
> > +  if (DCI.isAfterLegalizeVectorOps())
> > +    return SDValue();
> > +
> > +  if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES())
> > ||
> > +      (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE())
> >  ||
> > +      (N->getValueType(0) == MVT::v4f32 &&
> > PPCSubTarget.hasAltivec())) {
> 
> I'd suggest factoring N->getValueType(0) into a variable, since it
> appears about a dozen times.  It will make the code tighter and
> easier
> to read.

Good idea!

> 
> > +
> > +    // Newton iteration for a function: F(X) is X_{i+1} = X_i -
> > F(X_i)/F'(X_i)
> > +    // For the reciprocal, we need to find the zero of the
> > function:
> > +    //   F(X) = A X - 1 [which has a zero at X = 1/A]
> > +    //     =>
> > +    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this
> > second form
> > +    //     does not require additional intermediate precision]
> > +
> > +    // Convergence is quadratic, so we essentially double the
> > number of digits
> > +    // correct after every iteration. The minimum architected
> > relative
> > +    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE
> > float has
> > +    // 23 digits and double has 52 digits.
> > +    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
> > +    if (N->getValueType(0).getScalarType() == MVT::f64)
> > +      ++Iterations;
> > +
> > +    SelectionDAG &DAG = DCI.DAG;
> > +    DebugLoc dl = N->getDebugLoc();
> > +
> > +    SDValue FPOne =
> > +      DAG.getConstantFP(1.0, N->getValueType(0).getScalarType());
> > +    if (N->getValueType(0).isVector()) {
> > +      assert(N->getValueType(0).getVectorNumElements() == 4 &&
> > +             "Unknown vector type");
> > +      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl,
> > N->getValueType(0),
> > +                          FPOne, FPOne, FPOne, FPOne);
> > +    }
> > +
> > +    SDValue Est = DAG.getNode(PPCISD::FRE, dl,
> > +                              N->getValueType(0),
> > +                              UseOperand ? N->getOperand(1) :
> > +                                           SDValue(N, 0));
> > +    DCI.AddToWorklist(Est.getNode());
> > +
> > +    // Newton iterations: Est = Est + Est (1 - Arg * Est)
> > +    for (int i = 0; i < Iterations; ++i) {
> > +      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
> > +                                   N->getValueType(0),
> > +                                   UseOperand ? N->getOperand(1) :
> > +                                                SDValue(N, 0),
> > +                                   Est);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      NewEst = DAG.getNode(ISD::FSUB, dl,
> > +                           N->getValueType(0), FPOne, NewEst);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      NewEst = DAG.getNode(ISD::FMUL, dl,
> > +                           N->getValueType(0), Est, NewEst);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      Est = DAG.getNode(ISD::FADD, dl,
> > +                        N->getValueType(0), Est, NewEst);
> > +      DCI.AddToWorklist(Est.getNode());
> > +    }
> > +
> > +    return Est;
> > +  }
> > +
> > +  return SDValue();
> > +}
> > +
> > +SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N,
> > +                                             DAGCombinerInfo &DCI)
> > const {
> > +  if (DCI.isAfterLegalizeVectorOps())
> > +    return SDValue();
> > +
> > +  if ((N->getValueType(0) == MVT::f32 &&
> > PPCSubTarget.hasFRSQRTES()) ||
> > +      (N->getValueType(0) == MVT::f64 &&
> > PPCSubTarget.hasFRSQRTE())  ||
> > +      (N->getValueType(0) == MVT::v4f32 &&
> > PPCSubTarget.hasAltivec())) {
> 
> Again, suggest factoring this.
> 
> > +
> > +    // Newton iteration for a function: F(X) is X_{i+1} = X_i -
> > F(X_i)/F'(X_i)
> > +    // For the reciprocal sqrt, we need to find the zero of the
> > function:
> > +    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
> > +    //     =>
> > +    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
> > +    // As a result, we precompute A/2 prior to the iteration loop.
> > +
> > +    // Convergence is quadratic, so we essentially double the
> > number of digits
> > +    // correct after every iteration. The minimum architected
> > relative
> > +    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE
> > float has
> > +    // 23 digits and double has 52 digits.
> > +    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
> > +    if (N->getValueType(0).getScalarType() == MVT::f64)
> > +      ++Iterations;
> > +
> > +    SelectionDAG &DAG = DCI.DAG;
> > +    DebugLoc dl = N->getDebugLoc();
> > +
> > +    SDValue FPThreeHalfs =
> > +      DAG.getConstantFP(1.5, N->getValueType(0).getScalarType());
> > +    if (N->getValueType(0).isVector()) {
> > +      assert(N->getValueType(0).getVectorNumElements() == 4 &&
> > +             "Unknown vector type");
> > +      FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl,
> > N->getValueType(0),
> > +                                 FPThreeHalfs, FPThreeHalfs,
> > +                                 FPThreeHalfs, FPThreeHalfs);
> > +    }
> 
> Halves, please.  Ain't no such word as "halfs."  ;)

Fixed. r178672.

> 
> Nice work on the patch!

Thanks for the review!

 -Hal

> 
> Bill
> 
> > +
> > +    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl,
> > +                              N->getValueType(0),
> > N->getOperand(0));
> > +    DCI.AddToWorklist(Est.getNode());
> > +
> > +    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg)
> > so that
> > +    // this entire sequence requires only one FP constant.
> > +    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl,
> > N->getValueType(0),
> > +                                  FPThreeHalfs, N->getOperand(0));
> > +    DCI.AddToWorklist(HalfArg.getNode());
> > +
> > +    HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0),
> > +                          HalfArg, N->getOperand(0));
> > +    DCI.AddToWorklist(HalfArg.getNode());
> > +
> > +    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
> > +    for (int i = 0; i < Iterations; ++i) {
> > +      SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
> > +                                   N->getValueType(0), Est, Est);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      NewEst = DAG.getNode(ISD::FMUL, dl,
> > +                           N->getValueType(0), HalfArg, NewEst);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      NewEst = DAG.getNode(ISD::FSUB, dl,
> > +                           N->getValueType(0), FPThreeHalfs,
> > NewEst);
> > +      DCI.AddToWorklist(NewEst.getNode());
> > +
> > +      Est = DAG.getNode(ISD::FMUL, dl,
> > +                        N->getValueType(0), Est, NewEst);
> > +      DCI.AddToWorklist(Est.getNode());
> > +    }
> > +
> > +    return Est;
> > +  }
> > +
> > +  return SDValue();
> > +}
> > +
> >  SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
> >                                               DAGCombinerInfo &DCI)
> >                                               const {
> >    const TargetMachine &TM = getTargetMachine();
> > @@ -6684,7 +6850,44 @@ SDValue PPCTargetLowering::PerformDAGCom
> >          return N->getOperand(0);
> >      }
> >      break;
> > +  case ISD::FDIV: {
> > +    assert(TM.Options.UnsafeFPMath &&
> > +           "Reciprocal estimates require UnsafeFPMath");
> > +
> > +    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
> > +      SDValue RV =
> > DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI);
> > +      if (RV.getNode() != 0) {
> > +        DCI.AddToWorklist(RV.getNode());
> > +        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
> > +                           N->getOperand(0), RV);
> > +      }
> > +    }
> > +
> > +    SDValue RV = DAGCombineFastRecip(N, DCI);
> > +    if (RV.getNode() != 0) {
> > +      DCI.AddToWorklist(RV.getNode());
> > +      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
> > +                         N->getOperand(0), RV);
> > +    }
> > +
> > +    }
> > +    break;
> > +  case ISD::FSQRT: {
> > +    assert(TM.Options.UnsafeFPMath &&
> > +           "Reciprocal estimates require UnsafeFPMath");
> > +
> > +    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of
> > the
> > +    // reciprocal sqrt.
> > +    SDValue RV = DAGCombineFastRecipFSQRT(N, DCI);
> > +    if (RV.getNode() != 0) {
> > +      DCI.AddToWorklist(RV.getNode());
> > +      RV = DAGCombineFastRecip(RV.getNode(), DCI, false);
> > +      if (RV.getNode() != 0)
> > +        return RV;
> > +    }
> > 
> > +    }
> > +    break;
> >    case ISD::SINT_TO_FP:
> >      if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
> >        if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h Tue Apr  2
> > 23:01:11 2013
> > @@ -49,6 +49,9 @@ namespace llvm {
> >        /// unsigned integers.
> >        FCTIDUZ, FCTIWUZ,
> > 
> > +      /// Reciprocal estimate instructions (unary FP ops).
> > +      FRE, FRSQRTE,
> > +
> >        // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP
> >        instructions, taking
> >        // three v4f32 operands and producing a v4f32 result.
> >        VMADDFP, VNMSUBFP,
> > @@ -620,6 +623,10 @@ namespace llvm {
> > 
> >      SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG)
> >      const;
> >      SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG)
> >      const;
> > +
> > +    SDValue DAGCombineFastRecip(SDNode *N, DAGCombinerInfo &DCI,
> > +                                bool UseOperand = true) const;
> > +    SDValue DAGCombineFastRecipFSQRT(SDNode *N, DAGCombinerInfo
> > &DCI) const;
> >    };
> >  }
> > 
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td Tue Apr  2
> > 23:01:11 2013
> > @@ -795,6 +795,9 @@ def : Pat<(int_ppc_altivec_vnmsubfp v4f3
> >  def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
> >            (VPERM $vA, $vB, $vC)>;
> > 
> > +def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
> > +def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
> > +
> >  // Vector shifts
> >  def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
> >            (v16i8 (VSLB $vA, $vB))>;
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td Tue Apr  2
> > 23:01:11 2013
> > @@ -62,6 +62,9 @@ def SDT_PPCTC_ret : SDTypeProfile<0, 2,
> >  // PowerPC specific DAG Nodes.
> >  //
> > 
> > +def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
> > +def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
> > +
> >  def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
> >  def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
> >  def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
> > @@ -1223,8 +1226,21 @@ def FNEGS  : XForm_26<63, 40, (outs F4RC
> >  def FNEGD  : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
> >                        "fneg $frD, $frB", FPGeneral,
> >                        [(set f64:$frD, (fneg f64:$frB))]>;
> > +
> > +// Reciprocal estimates.
> > +def FRE      : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB),
> > +                        "fre $frD, $frB", FPGeneral,
> > +                        [(set f64:$frD, (PPCfre f64:$frB))]>;
> > +def FRES     : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB),
> > +                        "fres $frD, $frB", FPGeneral,
> > +                        [(set f32:$frD, (PPCfre f32:$frB))]>;
> > +def FRSQRTE  : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB),
> > +                        "frsqrte $frD, $frB", FPGeneral,
> > +                        [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
> > +def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB),
> > +                        "frsqrtes $frD, $frB", FPGeneral,
> > +                        [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
> >  }
> > -
> > 
> >  // XL-Form instructions.  condition register logical ops.
> >  //
> > @@ -1687,5 +1703,15 @@ def : Pat<(membarrier (i32 imm /*ll*/),
> > 
> >  def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
> > 
> > +// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
> > +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
> > +          (FNMSUB $A, $C, $B)>;
> > +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
> > +          (FNMSUB $A, $C, $B)>;
> > +def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
> > +          (FNMSUBS $A, $C, $B)>;
> > +def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
> > +          (FNMSUBS $A, $C, $B)>;
> > +
> >  include "PPCInstrAltivec.td"
> >  include "PPCInstr64Bit.td"
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCSubtarget.cpp Tue Apr  2
> > 23:01:11 2013
> > @@ -38,6 +38,11 @@ PPCSubtarget::PPCSubtarget(const std::st
> >    , HasAltivec(false)
> >    , HasQPX(false)
> >    , HasFSQRT(false)
> > +  , HasFRE(false)
> > +  , HasFRES(false)
> > +  , HasFRSQRTE(false)
> > +  , HasFRSQRTES(false)
> > +  , HasRecipPrec(false)
> >    , HasSTFIWX(false)
> >    , HasLFIWAX(false)
> >    , HasFPRND(false)
> > 
> > Modified: llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h?rev=178617&r1=178616&r2=178617&view=diff
> > ==============================================================================
> > --- llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h (original)
> > +++ llvm/trunk/lib/Target/PowerPC/PPCSubtarget.h Tue Apr  2
> > 23:01:11 2013
> > @@ -77,6 +77,8 @@ protected:
> >    bool HasAltivec;
> >    bool HasQPX;
> >    bool HasFSQRT;
> > +  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
> > +  bool HasRecipPrec;
> >    bool HasSTFIWX;
> >    bool HasLFIWAX;
> >    bool HasFPRND;
> > @@ -159,6 +161,11 @@ public:
> > 
> >    // Specific obvious features.
> >    bool hasFSQRT() const { return HasFSQRT; }
> > +  bool hasFRE() const { return HasFRE; }
> > +  bool hasFRES() const { return HasFRES; }
> > +  bool hasFRSQRTE() const { return HasFRSQRTE; }
> > +  bool hasFRSQRTES() const { return HasFRSQRTES; }
> > +  bool hasRecipPrec() const { return HasRecipPrec; }
> >    bool hasSTFIWX() const { return HasSTFIWX; }
> >    bool hasLFIWAX() const { return HasLFIWAX; }
> >    bool hasFPRND() const { return HasFPRND; }
> > 
> > Added: llvm/trunk/test/CodeGen/PowerPC/recipest.ll
> > URL:
> > http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/recipest.ll?rev=178617&view=auto
> > ==============================================================================
> > --- llvm/trunk/test/CodeGen/PowerPC/recipest.ll (added)
> > +++ llvm/trunk/test/CodeGen/PowerPC/recipest.ll Tue Apr  2 23:01:11
> > 2013
> > @@ -0,0 +1,178 @@
> > +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
> > -enable-unsafe-fp-math | FileCheck %s
> > +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 |
> > FileCheck -check-prefix=CHECK-SAFE %s
> > +target datalayout =
> > "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
> > +target triple = "powerpc64-unknown-linux-gnu"
> > +
> > +declare double @llvm.sqrt.f64(double)
> > +declare float @llvm.sqrt.f32(float)
> > +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
> > +
> > +define double @foo(double %a, double %b) nounwind {
> > +entry:
> > +  %x = call double @llvm.sqrt.f64(double %b)
> > +  %r = fdiv double %a, %x
> > +  ret double %r
> > +
> > +; CHECK: @foo
> > +; CHECK: frsqrte
> > +; CHECK: fnmsub
> > +; CHECK: fmul
> > +; CHECK: fmadd
> > +; CHECK: fmul
> > +; CHECK: fmul
> > +; CHECK: fmadd
> > +; CHECK: fmul
> > +; CHECK: fmul
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @foo
> > +; CHECK-SAFE: fsqrt
> > +; CHECK-SAFE: fdiv
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define float @goo(float %a, float %b) nounwind {
> > +entry:
> > +  %x = call float @llvm.sqrt.f32(float %b)
> > +  %r = fdiv float %a, %x
> > +  ret float %r
> > +
> > +; CHECK: @goo
> > +; CHECK: frsqrtes
> > +; CHECK: fnmsubs
> > +; CHECK: fmuls
> > +; CHECK: fmadds
> > +; CHECK: fmuls
> > +; CHECK: fmuls
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @goo
> > +; CHECK-SAFE: fsqrts
> > +; CHECK-SAFE: fdivs
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
> > +entry:
> > +  %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
> > +  %r = fdiv <4 x float> %a, %x
> > +  ret <4 x float> %r
> > +
> > +; CHECK: @hoo
> > +; CHECK: vrsqrtefp
> > +
> > +; CHECK-SAFE: @hoo
> > +; CHECK-SAFE-NOT: vrsqrtefp
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define double @foo2(double %a, double %b) nounwind {
> > +entry:
> > +  %r = fdiv double %a, %b
> > +  ret double %r
> > +
> > +; CHECK: @foo2
> > +; CHECK: fre
> > +; CHECK: fnmsub
> > +; CHECK: fmadd
> > +; CHECK: fnmsub
> > +; CHECK: fmadd
> > +; CHECK: fmul
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @foo2
> > +; CHECK-SAFE: fdiv
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define float @goo2(float %a, float %b) nounwind {
> > +entry:
> > +  %r = fdiv float %a, %b
> > +  ret float %r
> > +
> > +; CHECK: @goo2
> > +; CHECK: fres
> > +; CHECK: fnmsubs
> > +; CHECK: fmadds
> > +; CHECK: fmuls
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @goo2
> > +; CHECK-SAFE: fdivs
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind
> > {
> > +entry:
> > +  %r = fdiv <4 x float> %a, %b
> > +  ret <4 x float> %r
> > +
> > +; CHECK: @hoo2
> > +; CHECK: vrefp
> > +
> > +; CHECK-SAFE: @hoo2
> > +; CHECK-SAFE-NOT: vrefp
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define double @foo3(double %a) nounwind {
> > +entry:
> > +  %r = call double @llvm.sqrt.f64(double %a)
> > +  ret double %r
> > +
> > +; CHECK: @foo3
> > +; CHECK: frsqrte
> > +; CHECK: fnmsub
> > +; CHECK: fmul
> > +; CHECK: fmadd
> > +; CHECK: fmul
> > +; CHECK: fmul
> > +; CHECK: fmadd
> > +; CHECK: fmul
> > +; CHECK: fre
> > +; CHECK: fnmsub
> > +; CHECK: fmadd
> > +; CHECK: fnmsub
> > +; CHECK: fmadd
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @foo3
> > +; CHECK-SAFE: fsqrt
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define float @goo3(float %a) nounwind {
> > +entry:
> > +  %r = call float @llvm.sqrt.f32(float %a)
> > +  ret float %r
> > +
> > +; CHECK: @goo3
> > +; CHECK: frsqrtes
> > +; CHECK: fnmsubs
> > +; CHECK: fmuls
> > +; CHECK: fmadds
> > +; CHECK: fmuls
> > +; CHECK: fres
> > +; CHECK: fnmsubs
> > +; CHECK: fmadds
> > +; CHECK: blr
> > +
> > +; CHECK-SAFE: @goo3
> > +; CHECK-SAFE: fsqrts
> > +; CHECK-SAFE: blr
> > +}
> > +
> > +define <4 x float> @hoo3(<4 x float> %a) nounwind {
> > +entry:
> > +  %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
> > +  ret <4 x float> %r
> > +
> > +; CHECK: @hoo3
> > +; CHECK: vrsqrtefp
> > +; CHECK: vrefp
> > +
> > +; CHECK-SAFE: @hoo3
> > +; CHECK-SAFE-NOT: vrsqrtefp
> > +; CHECK-SAFE: blr
> > +}
> > +
> > 
> > 
> > _______________________________________________
> > llvm-commits mailing list
> > llvm-commits at cs.uiuc.edu
> > http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> > 
> 
>