[llvm] r276051 - AMDGPU: Change fdiv lowering based on !fpmath metadata

Hans Wennborg via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 8 13:33:22 PDT 2016


Ping.

There was some post-commit discussion here. Is this still a merge candidate?

On Mon, Jul 25, 2016 at 10:56 AM, Hans Wennborg <hans at chromium.org> wrote:
> sgtm if Tom agrees.
>
> Cheers,
> Hans
>
> On Mon, Jul 25, 2016 at 10:37 AM, Matt Arsenault via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
>> Hi,
>>
>> This should go to the release branch to prevent performance regressions
>>
>> -Matt
>>
>>> On Jul 19, 2016, at 16:16, Matt Arsenault via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>>>
>>> Author: arsenm
>>> Date: Tue Jul 19 18:16:53 2016
>>> New Revision: 276051
>>>
>>> URL: http://llvm.org/viewvc/llvm-project?rev=276051&view=rev
>>> Log:
>>> AMDGPU: Change fdiv lowering based on !fpmath metadata
>>>
>>> If 2.5 ulp is acceptable, denormals are not required, and
>>> isn't a reciprocal which will already be handled, replace
>>> with a faster fdiv.
>>>
>>> Simplify the lowering tests by using per function
>>> subtarget features.
>>>
>>> Added:
>>>    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
>>> Modified:
>>>    llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
>>>    llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
>>>    llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
>>>    llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
>>>    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
>>>    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
>>>    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
>>>    llvm/trunk/lib/Target/AMDGPU/SIIntrinsics.td
>>>    llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
>>>    llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Tue Jul 19 18:16:53 2016
>>> @@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
>>> class AMDGPUSubtarget;
>>> class AMDGPUTargetMachine;
>>> class FunctionPass;
>>> +class GCNTargetMachine;
>>> struct MachineSchedContext;
>>> class MCAsmInfo;
>>> class raw_ostream;
>>> @@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass(
>>> FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
>>> FunctionPass *createSIDebuggerInsertNopsPass();
>>> FunctionPass *createSIInsertWaitsPass();
>>> -FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
>>> +FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
>>>
>>> ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
>>>
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp Tue Jul 19 18:16:53 2016
>>> @@ -14,7 +14,9 @@
>>> //===----------------------------------------------------------------------===//
>>>
>>> #include "AMDGPU.h"
>>> +#include "AMDGPUIntrinsicInfo.h"
>>> #include "AMDGPUSubtarget.h"
>>> +#include "AMDGPUTargetMachine.h"
>>>
>>> #include "llvm/Analysis/DivergenceAnalysis.h"
>>> #include "llvm/CodeGen/Passes.h"
>>> @@ -30,15 +32,28 @@ using namespace llvm;
>>> namespace {
>>>
>>> class AMDGPUCodeGenPrepare : public FunctionPass,
>>> -                             public InstVisitor<AMDGPUCodeGenPrepare> {
>>> +                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
>>> +  const GCNTargetMachine *TM;
>>> +  const SISubtarget *ST;
>>>   DivergenceAnalysis *DA;
>>> -  const TargetMachine *TM;
>>> +  Module *Mod;
>>> +  bool HasUnsafeFPMath;
>>>
>>> public:
>>>   static char ID;
>>>   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
>>>     FunctionPass(ID),
>>> -    TM(TM) { }
>>> +    TM(static_cast<const GCNTargetMachine *>(TM)),
>>> +    ST(nullptr),
>>> +    DA(nullptr),
>>> +    Mod(nullptr),
>>> +    HasUnsafeFPMath(false) { }
>>> +
>>> +  bool visitFDiv(BinaryOperator &I);
>>> +
>>> +  bool visitInstruction(Instruction &I) {
>>> +    return false;
>>> +  }
>>>
>>>   bool doInitialization(Module &M) override;
>>>   bool runOnFunction(Function &F) override;
>>> @@ -55,7 +70,92 @@ public:
>>>
>>> } // End anonymous namespace
>>>
>>> +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
>>> +  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
>>> +  if (!CNum)
>>> +    return false;
>>> +
>>> +  // Reciprocal f32 is handled separately without denormals.
>>> +  return UnsafeDiv && CNum->isExactlyValue(+1.0);
>>> +}
>>> +
>>> +// Insert an intrinsic for fast fdiv for safe math situations where we can
>>> +// reduce precision. Leave fdiv for situations where the generic node is
>>> +// expected to be optimized.
>>> +bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
>>> +  Type *Ty = FDiv.getType();
>>> +
>>> +  // TODO: Handle half
>>> +  if (!Ty->getScalarType()->isFloatTy())
>>> +    return false;
>>> +
>>> +  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
>>> +  if (!FPMath)
>>> +    return false;
>>> +
>>> +  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
>>> +  float ULP = FPOp->getFPAccuracy();
>>> +  if (ULP < 2.5f)
>>> +    return false;
>>> +
>>> +  FastMathFlags FMF = FPOp->getFastMathFlags();
>>> +  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
>>> +                                      FMF.allowReciprocal();
>>> +  if (ST->hasFP32Denormals() && !UnsafeDiv)
>>> +    return false;
>>> +
>>> +  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
>>> +  Builder.setFastMathFlags(FMF);
>>> +  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
>>> +
>>> +  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
>>> +  Function *Decl
>>> +    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
>>> +
>>> +  Value *Num = FDiv.getOperand(0);
>>> +  Value *Den = FDiv.getOperand(1);
>>> +
>>> +  Value *NewFDiv = nullptr;
>>> +
>>> +  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
>>> +    NewFDiv = UndefValue::get(VT);
>>> +
>>> +    // FIXME: Doesn't do the right thing for cases where the vector is partially
>>> +    // constant. This works when the scalarizer pass is run first.
>>> +    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
>>> +      Value *NumEltI = Builder.CreateExtractElement(Num, I);
>>> +      Value *DenEltI = Builder.CreateExtractElement(Den, I);
>>> +      Value *NewElt;
>>> +
>>> +      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
>>> +        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
>>> +      } else {
>>> +        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
>>> +      }
>>> +
>>> +      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
>>> +    }
>>> +  } else {
>>> +    if (!shouldKeepFDivF32(Num, UnsafeDiv))
>>> +      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
>>> +  }
>>> +
>>> +  if (NewFDiv) {
>>> +    FDiv.replaceAllUsesWith(NewFDiv);
>>> +    NewFDiv->takeName(&FDiv);
>>> +    FDiv.eraseFromParent();
>>> +  }
>>> +
>>> +  return true;
>>> +}
>>> +
>>> +static bool hasUnsafeFPMath(const Function &F) {
>>> +  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
>>> +  return Attr.getValueAsString() == "true";
>>> +}
>>> +
>>> bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
>>> +  Mod = &M;
>>>   return false;
>>> }
>>>
>>> @@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction
>>>   if (!TM || skipFunction(F))
>>>     return false;
>>>
>>> +  ST = &TM->getSubtarget<SISubtarget>(F);
>>>   DA = &getAnalysis<DivergenceAnalysis>();
>>> -  visit(F);
>>> +  HasUnsafeFPMath = hasUnsafeFPMath(F);
>>>
>>> -  return true;
>>> +  bool MadeChange = false;
>>> +
>>> +  for (BasicBlock &BB : F) {
>>> +    BasicBlock::iterator Next;
>>> +    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
>>> +      Next = std::next(I);
>>> +      MadeChange |= visit(*I);
>>> +    }
>>> +  }
>>> +
>>> +  return MadeChange;
>>> }
>>>
>>> INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
>>> @@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrep
>>>
>>> char AMDGPUCodeGenPrepare::ID = 0;
>>>
>>> -FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
>>> +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
>>>   return new AMDGPUCodeGenPrepare(TM);
>>> }
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp Tue Jul 19 18:16:53 2016
>>> @@ -29,16 +29,39 @@ static const char *const IntrinsicNameTa
>>> #undef GET_INTRINSIC_NAME_TABLE
>>> };
>>>
>>> -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
>>> -                                         unsigned numTys) const {
>>> -  if (IntrID < Intrinsic::num_intrinsics) {
>>> -    return nullptr;
>>> -  }
>>> +namespace {
>>> +#define GET_INTRINSIC_ATTRIBUTES
>>> +#include "AMDGPUGenIntrinsics.inc"
>>> +#undef GET_INTRINSIC_ATTRIBUTES
>>> +}
>>> +
>>> +StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
>>> +                                       ArrayRef<Type *> Tys) const {
>>> +  if (IntrID < Intrinsic::num_intrinsics)
>>> +    return StringRef();
>>> +
>>>   assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
>>>          "Invalid intrinsic ID");
>>>
>>> -  std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
>>> -  return Result;
>>> +  return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
>>> +}
>>> +
>>> +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
>>> +                                         unsigned NumTys) const {
>>> +  return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
>>> +}
>>> +
>>> +FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
>>> +                                           ArrayRef<Type*> Tys) const {
>>> +  // FIXME: Re-use Intrinsic::getType machinery
>>> +  switch (ID) {
>>> +  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
>>> +    Type *F32Ty = Type::getFloatTy(Context);
>>> +    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
>>> +  }
>>> +  default:
>>> +    llvm_unreachable("unhandled intrinsic");
>>> +  }
>>> }
>>>
>>> unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
>>> @@ -69,7 +92,19 @@ bool AMDGPUIntrinsicInfo::isOverloaded(u
>>> }
>>>
>>> Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
>>> +                                              ArrayRef<Type *> Tys) const {
>>> +  FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
>>> +  Function *F
>>> +    = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
>>> +
>>> +  AttributeSet AS = getAttributes(M->getContext(),
>>> +                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
>>> +  F->setAttributes(AS);
>>> +  return F;
>>> +}
>>> +
>>> +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
>>>                                               Type **Tys,
>>> -                                              unsigned numTys) const {
>>> -  llvm_unreachable("Not implemented");
>>> +                                              unsigned NumTys) const {
>>> +  return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
>>> }
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h Tue Jul 19 18:16:53 2016
>>> @@ -34,13 +34,23 @@ enum ID {
>>> class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
>>> public:
>>>   AMDGPUIntrinsicInfo();
>>> +
>>> +  StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
>>> +
>>>   std::string getName(unsigned IntrId, Type **Tys = nullptr,
>>> -                      unsigned numTys = 0) const override;
>>> +                      unsigned NumTys = 0) const override;
>>> +
>>>   unsigned lookupName(const char *Name, unsigned Len) const override;
>>>   bool isOverloaded(unsigned IID) const override;
>>>   Function *getDeclaration(Module *M, unsigned ID,
>>>                            Type **Tys = nullptr,
>>> -                           unsigned numTys = 0) const override;
>>> +                           unsigned NumTys = 0) const override;
>>> +
>>> +  Function *getDeclaration(Module *M, unsigned ID,
>>> +                           ArrayRef<Type *> = None) const;
>>> +
>>> +  FunctionType *getType(LLVMContext &Context, unsigned ID,
>>> +                        ArrayRef<Type*> Tys = None) const;
>>> };
>>>
>>> } // end namespace llvm
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Tue Jul 19 18:16:53 2016
>>> @@ -309,6 +309,7 @@ public:
>>>   ScheduleDAGInstrs *
>>>   createMachineScheduler(MachineSchedContext *C) const override;
>>>
>>> +  void addIRPasses() override;
>>>   bool addPreISel() override;
>>>   void addMachineSSAOptimization() override;
>>>   bool addInstSelector() override;
>>> @@ -499,6 +500,13 @@ void GCNPassConfig::addMachineSSAOptimiz
>>>   addPass(&DeadMachineInstructionElimID);
>>> }
>>>
>>> +void GCNPassConfig::addIRPasses() {
>>> +  // TODO: May want to move later or split into an early and late one.
>>> +  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
>>> +
>>> +  AMDGPUPassConfig::addIRPasses();
>>> +}
>>> +
>>> bool GCNPassConfig::addInstSelector() {
>>>   AMDGPUPassConfig::addInstSelector();
>>>   addPass(createSILowerI1CopiesPass());
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Jul 19 18:16:53 2016
>>> @@ -2113,6 +2113,9 @@ SDValue SITargetLowering::LowerINTRINSIC
>>>     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
>>>                                    Op->getVTList(), Ops, VT, MMO);
>>>   }
>>> +  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
>>> +    return lowerFDIV_FAST(Op, DAG);
>>> +  }
>>>   case AMDGPUIntrinsic::SI_vs_load_input:
>>>     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
>>>                        Op.getOperand(1),
>>> @@ -2427,7 +2430,8 @@ SDValue SITargetLowering::LowerSELECT(SD
>>>
>>> // Catch division cases where we can use shortcuts with rcp and rsq
>>> // instructions.
>>> -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
>>> +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
>>> +                                              SelectionDAG &DAG) const {
>>>   SDLoc SL(Op);
>>>   SDValue LHS = Op.getOperand(0);
>>>   SDValue RHS = Op.getOperand(1);
>>> @@ -2468,47 +2472,48 @@ SDValue SITargetLowering::LowerFastFDIV(
>>>   return SDValue();
>>> }
>>>
>>> -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
>>> -  if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
>>> -    return FastLowered;
>>> -
>>> +// Faster 2.5 ULP division that does not support denormals.
>>> +SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
>>>   SDLoc SL(Op);
>>> -  SDValue LHS = Op.getOperand(0);
>>> -  SDValue RHS = Op.getOperand(1);
>>> +  SDValue LHS = Op.getOperand(1);
>>> +  SDValue RHS = Op.getOperand(2);
>>> +
>>> +  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
>>>
>>> -  // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag
>>> -  if (EnableAMDGPUFastFDIV) {
>>> -    // This does not support denormals.
>>> -    SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
>>> +  const APFloat K0Val(BitsToFloat(0x6f800000));
>>> +  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
>>>
>>> -    const APFloat K0Val(BitsToFloat(0x6f800000));
>>> -    const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
>>> +  const APFloat K1Val(BitsToFloat(0x2f800000));
>>> +  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
>>>
>>> -    const APFloat K1Val(BitsToFloat(0x2f800000));
>>> -    const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
>>> +  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
>>>
>>> -    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
>>> +  EVT SetCCVT =
>>> +    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
>>>
>>> -    EVT SetCCVT =
>>> -        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
>>> +  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
>>>
>>> -    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
>>> +  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
>>>
>>> -    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
>>> +  // TODO: Should this propagate fast-math-flags?
>>> +  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
>>>
>>> -    // TODO: Should this propagate fast-math-flags?
>>> +  // rcp does not support denormals.
>>> +  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
>>>
>>> -    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
>>> +  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
>>>
>>> -    // rcp does not support denormals.
>>> -    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
>>> +  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
>>> +}
>>>
>>> -    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
>>> +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
>>> +  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
>>> +    return FastLowered;
>>>
>>> -    return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
>>> -  }
>>> +  SDLoc SL(Op);
>>> +  SDValue LHS = Op.getOperand(0);
>>> +  SDValue RHS = Op.getOperand(1);
>>>
>>> -  // Generates more precise fpdiv32.
>>>   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
>>>
>>>   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
>>> @@ -2538,7 +2543,7 @@ SDValue SITargetLowering::LowerFDIV32(SD
>>>
>>> SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
>>>   if (DAG.getTarget().Options.UnsafeFPMath)
>>> -    return LowerFastFDIV(Op, DAG);
>>> +    return lowerFastUnsafeFDIV(Op, DAG);
>>>
>>>   SDLoc SL(Op);
>>>   SDValue X = Op.getOperand(0);
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Tue Jul 19 18:16:53 2016
>>> @@ -36,7 +36,8 @@ class SITargetLowering final : public AM
>>>   SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
>>>   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
>>>   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
>>> -  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
>>> +  SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
>>> +  SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
>>>   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
>>>   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
>>>   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
>>>
>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIIntrinsics.td
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIIntrinsics.td?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/lib/Target/AMDGPU/SIIntrinsics.td (original)
>>> +++ llvm/trunk/lib/Target/AMDGPU/SIIntrinsics.td Tue Jul 19 18:16:53 2016
>>> @@ -7,7 +7,8 @@
>>> //
>>> //===----------------------------------------------------------------------===//
>>> //
>>> -// SI Intrinsic Definitions
>>> +// Backend internal SI Intrinsic Definitions. User code should not
>>> +// directly use these.
>>> //
>>> //===----------------------------------------------------------------------===//
>>>
>>> @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in
>>> } // End TargetPrefix = "SI", isTarget = 1
>>>
>>> let TargetPrefix = "amdgcn", isTarget = 1 in {
>>> +  // Emit 2.5 ulp, no denormal division. Should only be inserted by
>>> +  // pass based on !fpmath metadata.
>>> +  def int_amdgcn_fdiv_fast : Intrinsic<
>>> +    [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
>>> +  >;
>>> +
>>>   /* Control flow Intrinsics */
>>>
>>>   def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
>>>
>>> Modified: llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll (original)
>>> +++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll Tue Jul 19 18:16:53 2016
>>> @@ -1,8 +1,242 @@
>>> -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
>>> -; RUN: opt -S -amdgpu-codegenprepare < %s
>>> +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
>>> +; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
>>> ; Make sure this doesn't crash with no triple
>>>
>>> -; CHECK-LABEL: @foo(
>>> -define void @foo() {
>>> +; NOOP-LABEL: @noop_fdiv_fpmath(
>>> +; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
>>> +define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
>>> +  %md.25ulp = fdiv float %a, %b, !fpmath !0
>>> +  store volatile float %md.25ulp, float addrspace(1)* %out
>>>   ret void
>>> }
>>> +
>>> +; CHECK-LABEL: @fdiv_fpmath(
>>> +; CHECK: %no.md = fdiv float %a, %b{{$}}
>>> +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
>>> +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
>>> +; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
>>> +; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
>>> +; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
>>> +; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
>>> +define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
>>> +  %no.md = fdiv float %a, %b
>>> +  store volatile float %no.md, float addrspace(1)* %out
>>> +
>>> +  %md.half.ulp = fdiv float %a, %b, !fpmath !1
>>> +  store volatile float %md.half.ulp, float addrspace(1)* %out
>>> +
>>> +  %md.1ulp = fdiv float %a, %b, !fpmath !2
>>> +  store volatile float %md.1ulp, float addrspace(1)* %out
>>> +
>>> +  %md.25ulp = fdiv float %a, %b, !fpmath !0
>>> +  store volatile float %md.25ulp, float addrspace(1)* %out
>>> +
>>> +  %md.3ulp = fdiv float %a, %b, !fpmath !3
>>> +  store volatile float %md.3ulp, float addrspace(1)* %out
>>> +
>>> +  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
>>> +  store volatile float %fast.md.25ulp, float addrspace(1)* %out
>>> +
>>> +  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
>>> +  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @rcp_fdiv_fpmath(
>>> +; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
>>> +; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
>>> +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
>>> +; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
>>> +; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
>>> +; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
>>> +define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
>>> +  %no.md = fdiv float 1.0, %x
>>> +  store volatile float %no.md, float addrspace(1)* %out
>>> +
>>> +  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
>>> +  store volatile float %md.half.ulp, float addrspace(1)* %out
>>> +
>>> +  %arcp.no.md = fdiv arcp float 1.0, %x
>>> +  store volatile float %arcp.no.md, float addrspace(1)* %out
>>> +
>>> +  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
>>> +  store volatile float %arcp.25ulp, float addrspace(1)* %out
>>> +
>>> +  %fast.no.md = fdiv fast float 1.0, %x
>>> +  store volatile float %fast.no.md, float addrspace(1)* %out
>>> +
>>> +  %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
>>> +  store volatile float %fast.25ulp, float addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @fdiv_fpmath_vector(
>>> +; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
>>> +; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
>>> +; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
>>> +
>>> +; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
>>> +; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
>>> +; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
>>> +; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
>>> +; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
>>> +; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
>>> +; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
>>> +; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
>>> +define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
>>> +  %no.md = fdiv <2 x float> %a, %b
>>> +  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
>>> +  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
>>> +  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
>>> +  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
>>> +; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
>>> +; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
>>> +; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
>>> +; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
>>> +
>>> +; CHECK: extractelement <2 x float> %x
>>> +; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
>>> +; CHECK: extractelement <2 x float> %x
>>> +; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
>>> +; CHECK: store volatile <2 x float> %arcp.25ulp
>>> +
>>> +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
>>> +; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
>>> +; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
>>> +define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
>>> +  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
>>> +  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
>>> +  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
>>> +  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
>>> +  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
>>> +  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
>>> +  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
>>> +; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
>>> +; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
>>> +; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
>>> +
>>> +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
>>> +; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
>>> +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
>>> +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
>>> +; CHECK: store volatile <2 x float> %arcp.25ulp
>>> +
>>> +; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
>>> +; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
>>> +; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
>>> +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0
>>> +; CHECK: store volatile <2 x float> %fast.25ulp
>>> +define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
>>> +  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
>>> +  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
>>> +  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
>>> +  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
>>> +
>>> +  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
>>> +  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
>>> +  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; FIXME: Should be able to get fdiv for 1.0 component
>>> +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
>>> +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
>>> +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
>>> +; CHECK: store volatile <2 x float> %arcp.25ulp
>>> +
>>> +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
>>> +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
>>> +; CHECK: store volatile <2 x float> %fast.25ulp
>>> +define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
>>> +  %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
>>> +
>>> +  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
>>> +  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
>>> +  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
>>> +; CHECK: %no.md = fdiv float %a, %b{{$}}
>>> +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
>>> +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
>>> +; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
>>> +; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
>>> +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
>>> +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
>>> +define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
>>> +  %no.md = fdiv float %a, %b
>>> +  store volatile float %no.md, float addrspace(1)* %out
>>> +
>>> +  %md.half.ulp = fdiv float %a, %b, !fpmath !1
>>> +  store volatile float %md.half.ulp, float addrspace(1)* %out
>>> +
>>> +  %md.1ulp = fdiv float %a, %b, !fpmath !2
>>> +  store volatile float %md.1ulp, float addrspace(1)* %out
>>> +
>>> +  %md.25ulp = fdiv float %a, %b, !fpmath !0
>>> +  store volatile float %md.25ulp, float addrspace(1)* %out
>>> +
>>> +  %md.3ulp = fdiv float %a, %b, !fpmath !3
>>> +  store volatile float %md.3ulp, float addrspace(1)* %out
>>> +
>>> +  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
>>> +  store volatile float %fast.md.25ulp, float addrspace(1)* %out
>>> +
>>> +  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
>>> +  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
>>> +
>>> +  ret void
>>> +}
>>> +
>>> +attributes #0 = { nounwind optnone noinline }
>>> +attributes #1 = { nounwind }
>>> +attributes #2 = { nounwind "target-features"="+fp32-denormals" }
>>> +
>>> +; CHECK: !0 = !{float 2.500000e+00}
>>> +; CHECK: !1 = !{float 5.000000e-01}
>>> +; CHECK: !2 = !{float 1.000000e+00}
>>> +; CHECK: !3 = !{float 3.000000e+00}
>>> +
>>> +!0 = !{float 2.500000e+00}
>>> +!1 = !{float 5.000000e-01}
>>> +!2 = !{float 1.000000e+00}
>>> +!3 = !{float 3.000000e+00}
>>>
>>> Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll?rev=276051&r1=276050&r2=276051&view=diff
>>> ==============================================================================
>>> --- llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll (original)
>>> +++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.ll Tue Jul 19 18:16:53 2016
>>> @@ -1,8 +1,4 @@
>>> ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
>>> -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
>>> -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
>>> -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s
>>> -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s
>>> ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
>>>
>>> ; These tests check that fdiv is expanded correctly and also test that the
>>> @@ -15,22 +11,59 @@
>>> ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> +; SI: v_div_scale_f32
>>> +; SI-DAG: v_div_scale_f32
>>>
>>> ; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> +; SI: v_fma_f32
>>> +; SI: v_fma_f32
>>> +; SI: v_mul_f32
>>> +; SI: v_fma_f32
>>> +; SI: v_fma_f32
>>> +; SI: v_fma_f32
>>> +; SI: v_div_fmas_f32
>>> +; SI: v_div_fixup_f32
>>> +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
>>> +entry:
>>> +  %fdiv = fdiv float %a, %b
>>> +  store float %fdiv, float addrspace(1)* %out
>>> +  ret void
>>> +}
>>> +
>>> +; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
>>> +; SI: v_cndmask_b32
>>> +; SI: v_mul_f32
>>> +; SI: v_rcp_f32
>>> +; SI: v_mul_f32
>>> +; SI: v_mul_f32
>>> +define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
>>> +entry:
>>> +  %fdiv = fdiv float %a, %b, !fpmath !0
>>> +  store float %fdiv, float addrspace(1)* %out
>>> +  ret void
>>> +}
>>> +
>>> +; Use correct fdiv
>>> +; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
>>> +; SI: v_fma_f32
>>> +; SI: v_div_fmas_f32
>>> +; SI: v_div_fixup_f32
>>> +define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
>>> +entry:
>>> +  %fdiv = fdiv float %a, %b, !fpmath !0
>>> +  store float %fdiv, float addrspace(1)* %out
>>> +  ret void
>>> +}
>>>
>>> -; I754-DAG: v_div_scale_f32
>>> -; I754-DAG: v_rcp_f32
>>> -; I754-DAG: v_fma_f32
>>> -; I754-DAG: v_mul_f32
>>> -; I754-DAG: v_fma_f32
>>> -; I754-DAG: v_div_fixup_f32
>>> -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
>>> +; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
>>> +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
>>> +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
>>> +; SI-NOT: [[RESULT]]
>>> +; SI: buffer_store_dword [[RESULT]]
>>> +define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
>>> entry:
>>> -  %0 = fdiv float %a, %b
>>> -  store float %0, float addrspace(1)* %out
>>> +  %fdiv = fdiv fast float %a, %b
>>> +  store float %fdiv, float addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -38,15 +71,14 @@ entry:
>>> ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) {
>>> +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
>>> +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
>>> +; SI-NOT: [[RESULT]]
>>> +; SI: buffer_store_dword [[RESULT]]
>>> +define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
>>> entry:
>>> -  %0 = fdiv fast float %a, %b
>>> -  store float %0, float addrspace(1)* %out
>>> +  %fdiv = fdiv fast float %a, %b
>>> +  store float %fdiv, float addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -54,15 +86,14 @@ entry:
>>> ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) {
>>> +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
>>> +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
>>> +; SI-NOT: [[RESULT]]
>>> +; SI: buffer_store_dword [[RESULT]]
>>> +define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
>>> entry:
>>> -  %0 = fdiv arcp float %a, %b
>>> -  store float %0, float addrspace(1)* %out
>>> +  %fdiv = fdiv arcp float %a, %b
>>> +  store float %fdiv, float addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -72,26 +103,24 @@ entry:
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> +; SI: v_div_scale_f32
>>> +; SI: v_div_scale_f32
>>> +; SI: v_div_scale_f32
>>> +; SI: v_div_scale_f32
>>> +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
>>> +entry:
>>> +  %fdiv = fdiv <2 x float> %a, %b
>>> +  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
>>> +  ret void
>>> +}
>>>
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_fixup_f32
>>> -; I754: v_div_fixup_f32
>>> -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
>>> +; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
>>> +; SI: v_cmp_gt_f32
>>> +; SI: v_cmp_gt_f32
>>> +define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
>>> entry:
>>> -  %0 = fdiv <2 x float> %a, %b
>>> -  store <2 x float> %0, <2 x float> addrspace(1)* %out
>>> +  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
>>> +  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -101,19 +130,12 @@ entry:
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
>>> entry:
>>> -  %0 = fdiv fast <2 x float> %a, %b
>>> -  store <2 x float> %0, <2 x float> addrspace(1)* %out
>>> +  %fdiv = fdiv fast <2 x float> %a, %b
>>> +  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -123,19 +145,12 @@ entry:
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_rcp_f32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
>>> entry:
>>> -  %0 = fdiv arcp <2 x float> %a, %b
>>> -  store <2 x float> %0, <2 x float> addrspace(1)* %out
>>> +  %fdiv = fdiv arcp <2 x float> %a, %b
>>> +  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
>>>   ret void
>>> }
>>>
>>> @@ -149,37 +164,11 @@ entry:
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_scale_f32
>>> -; I754: v_div_fixup_f32
>>> -; I754: v_div_fixup_f32
>>> -; I754: v_div_fixup_f32
>>> -; I754: v_div_fixup_f32
>>> -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
>>> +; SI: v_div_fixup_f32
>>> +; SI: v_div_fixup_f32
>>> +; SI: v_div_fixup_f32
>>> +; SI: v_div_fixup_f32
>>> +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
>>>   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
>>>   %a = load <4 x float>, <4 x float> addrspace(1) * %in
>>>   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
>>> @@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addr
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
>>>   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
>>>   %a = load <4 x float>, <4 x float> addrspace(1) * %in
>>>   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
>>> @@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x f
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>> ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
>>>
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_rcp_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -; UNSAFE-FP: v_mul_f32_e32
>>> -
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -; SI-DAG: v_rcp_f32
>>> -; SI-DAG: v_mul_f32
>>> -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +; SI: v_rcp_f32
>>> +define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
>>>   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
>>>   %a = load <4 x float>, <4 x float> addrspace(1) * %in
>>>   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
>>> @@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x f
>>>   store <4 x float> %result, <4 x float> addrspace(1)* %out
>>>   ret void
>>> }
>>> +
>>> +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
>>> +attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
>>> +attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
>>> +
>>> +!0 = !{float 2.500000e+00}
>>>
>>> Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
>>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll?rev=276051&view=auto
>>> ==============================================================================
>>> --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll (added)
>>> +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll Tue Jul 19 18:16:53 2016
>>> @@ -0,0 +1,18 @@
>>> +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
>>> +
>>> +declare float @llvm.amdgcn.fdiv.fast(float, float) #0
>>> +
>>> +; CHECK-LABEL: {{^}}test_fdiv_fast:
>>> +; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
>>> +; CHECK: v_mul_f32_e32
>>> +; CHECK: v_rcp_f32_e32
>>> +; CHECK: v_mul_f32_e32
>>> +; CHECK: v_mul_f32_e32
>>> +define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
>>> +  %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
>>> +  store float %fdiv, float addrspace(1)* %out
>>> +  ret void
>>> +}
>>> +
>>> +attributes #0 = { nounwind readnone }
>>> +attributes #1 = { nounwind }
>>>
>>>
>>> _______________________________________________
>>> llvm-commits mailing list
>>> llvm-commits at lists.llvm.org
>>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits


More information about the llvm-commits mailing list