[llvm] 884acbb - AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare

Changpeng Fang via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 7 11:47:07 PST 2020


Author: Changpeng Fang
Date: 2020-02-07T11:46:23-08:00
New Revision: 884acbb9e167d5668e43581630239d688edec8ad

URL: https://github.com/llvm/llvm-project/commit/884acbb9e167d5668e43581630239d688edec8ad
DIFF: https://github.com/llvm/llvm-project/commit/884acbb9e167d5668e43581630239d688edec8ad.diff

LOG: AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare

Summary:
  The accuracy limit for using rcp is adjusted from 2.5 ulp to 1.0 ulp,
matching the instruction's documented worst-case error of 1 ulp. Also, afn
rather than arcp is now used to allow an inaccurate rcp.
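
As an illustrative sketch of the new rules (f32 with denormals flushed; the
metadata id is invented for the example):

  ; 1.0 ulp is now accurate enough for rcp, so 1/x becomes
  ; call float @llvm.amdgcn.rcp.f32(float %x):
  %md.1ulp = fdiv float 1.0, %x, !fpmath !1

  ; afn (rather than arcp) now permits the inaccurate a*rcp(b) form:
  ;   %t = call afn float @llvm.amdgcn.rcp.f32(float %b)
  ;        fmul afn float %a, %t
  %afn.div = fdiv afn float %a, %b

  !1 = !{float 1.000000e+00}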

Reviewers:
  arsenm

Differential Revision: https://reviews.llvm.org/D73588

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
    llvm/test/CodeGen/AMDGPU/fdiv.f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 76f8d5e8c320..e3cc2a4abdee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -606,24 +606,23 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
   return true;
 }
 
-// Perform RCP optimizations:
+// Optimize fdiv with rcp:
 //
-// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                denormals flushed.
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//               allowed with unsafe-fp-math or afn.
 //
-// a/b -> a*rcp(b) when fast unsafe rcp is legal.
-static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
-                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
-                            bool HasDenormals, bool NeedHighAccuracy) {
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+                              bool RcpIsAccurate, IRBuilder<> Builder,
+                              Module *Mod) {
 
-  Type *Ty = Den->getType();
-  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
-                             (HasDenormals || NeedHighAccuracy))
+  if (!AllowInaccurateRcp && !RcpIsAccurate)
     return nullptr;
 
+  Type *Ty = Den->getType();
   Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
   if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
-    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+    if (AllowInaccurateRcp || RcpIsAccurate) {
       if (CLHS->isExactlyValue(1.0)) {
         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
         // the CI documentation has a worst case error of 1 ulp.
@@ -648,49 +647,63 @@ static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
     }
   }
 
-  if (FastUnsafeRcpLegal) {
+  if (AllowInaccurateRcp) {
     // Turn into multiply by the reciprocal.
     // x / y -> x * (1.0 / y)
     Value *Recip = Builder.CreateCall(Decl, { Den });
-    return Builder.CreateFMul(Num, Recip, "", FPMath);
+    return Builder.CreateFMul(Num, Recip);
   }
   return nullptr;
 }
 
-static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
-                              bool HasDenormals) {
-  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
-  if (!CNum)
-    return HasDenormals;
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is preferred.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+                                   bool HasDenormals, IRBuilder<> Builder,
+                                   Module *Mod) {
+  // fdiv.fast can achieve 2.5 ULP accuracy.
+  if (ReqdAccuracy < 2.5f)
+    return nullptr;
 
-  if (FastUnsafeRcpLegal)
-    return true;
+  // Only have fdiv.fast for f32.
+  Type *Ty = Den->getType();
+  if (!Ty->isFloatTy())
+    return nullptr;
 
-  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+  bool NumIsOne = false;
+  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+      NumIsOne = true;
+  }
 
-  // Reciprocal f32 is handled separately without denormals.
-  return HasDenormals ^ IsOne;
-}
+  // fdiv.fast does not support denormals, but it is always fine for 1.0/x.
+  if (HasDenormals && !NumIsOne)
+    return nullptr;
 
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+  return Builder.CreateCall(Decl, { Num, Den });
+}
 
-// Optimizations is performed based on fpmath, fast math flags as wells as
-// denormals to lower fdiv using either rcp or fdiv.fast.
+// Optimization is performed based on fpmath, fast math flags as well as
+// denormals to optimize fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+//                 allowed with unsafe-fp-math or afn.
 //
-// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
-//                     unsafe-fp-math, fast math flags, denormals and fpmath
-//                     accuracy request.
+//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
 //
-// RCP Optimizations:
-//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-//                                                  denormals flushed.
-//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
+// With fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
 //
-// Use fdiv.fast:
-//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals flushed.
+//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
 //
-//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
-//                          fpmath >= 2.5ULP with denormals.
+// NOTE: rcp is preferred in cases where both are legal.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
 
   Type *Ty = FDiv.getType()->getScalarType();
@@ -700,19 +713,17 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     return false;
 
   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
+  const float ReqdAccuracy = FPOp->getFPAccuracy();
 
+  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  // Determine whether it is ok to use rcp based on unsafe-fp-math,
-  // fast math flags, denormals and accuracy request.
-  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
-          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
-                                     || FMF.approxFunc()));
+  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
 
-  // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
-  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
-                           !FastUnsafeRcpLegal;
+  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+  // rcp_f64 is never accurate.
+  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
 
   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
@@ -730,31 +741,24 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt = nullptr;
-      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
-                                           HasFP32Denormals)) {
-        Function *Decl =
-                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
-      }
-      if (!NewElt) // Try rcp.
-        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
-                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
-      if (!NewElt)
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
+      // Try rcp first.
+      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+                                      RcpIsAccurate, Builder, Mod);
+      if (!NewElt) // Try fdiv.fast.
+        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+                                      HasFP32Denormals, Builder, Mod);
+      if (!NewElt) // Keep the original.
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
 
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else { // Scalar.
-    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
-                                          HasFP32Denormals)) {
-      Function *Decl =
-               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
-    }
-    if (!NewFDiv) { // Try rcp.
-      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
-                              Mod, HasFP32Denormals, NeedHighAccuracy);
+  } else { // Scalar FDiv.
+    // Try rcp first.
+    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+                              Builder, Mod);
+    if (!NewFDiv) { // Try fdiv.fast.
+      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+                                     Builder, Mod);
     }
   }
 

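The net effect in AMDGPUCodeGenPrepare, sketched against the updated tests
(f32, denormals flushed, no fast-math flags, !0 = !{float 2.500000e+00}):

  ; rcp is tried first; 1/x qualifies because rcp is accurate to 1.0 ulp:
  %r0 = fdiv float 1.0, %x, !fpmath !0  ; -> @llvm.amdgcn.rcp.f32(%x)

  ; a/b cannot use rcp without afn or unsafe-fp-math, so fdiv.fast
  ; (good to 2.5 ulp) is used instead:
  %r1 = fdiv float %a, %b, !fpmath !0   ; -> @llvm.amdgcn.fdiv.fast(%a, %b)
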
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7cbdd4982cdb..475a2b8f30da 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7418,19 +7418,12 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
 
-  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
-         (Flags.hasAllowReciprocal() &&
-          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
-            VT == MVT::f16 ||
-            Flags.hasApproximateFuncs()));
-
-  // Do rcp optimization only when fast unsafe rcp is legal here.
-  // NOTE: We already performed RCP optimization to insert intrinsics in
-  // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
-  // rcp optimization.
-  //   However, there are cases like FREM, which is expended into a sequence
-  // of instructions including FDIV, which may expose new opportunities.
-  if (!FastUnsafeRcpLegal)
+  bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+                            Flags.hasApproximateFuncs();
+
+  // Without !fpmath accuracy information, we can't do more because we don't
+  // know whether rcp is accurate enough to meet the !fpmath requirement.
+  if (!AllowInaccurateRcp)
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {

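This DAG path still matters even though AMDGPUCodeGenPrepare already handled
most fdivs: as the removed comment observed, late expansions such as FREM
introduce new FDIV nodes. A hypothetical example:

  ; frem is expanded into a sequence containing fdiv, which
  ; lowerFastUnsafeFDIV can now turn into x * rcp(y):
  %r = frem fast float %x, %y
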
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 6ead8b9ad13e..3096372394bb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -12,14 +12,14 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
 
 ; CHECK-LABEL: @fdiv_fpmath(
 ; CHECK: %no.md = fdiv float %a, %b{{$}}
-; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
-; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
-; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
+; CHECK: %md.half.ulp = fdiv float %a, %b
+; CHECK: %md.1ulp = fdiv float %a, %b
+; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
+; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
 ; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0
-; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]]
+; CHECK: %[[AFN_RCP:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %afn.md.25ulp = fmul afn float %a, %[[AFN_RCP]]
 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
@@ -39,8 +39,8 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
   %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
   store volatile float %fast.md.25ulp, float addrspace(1)* %out
 
-  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
-  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+  %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
+  store volatile float %afn.md.25ulp, float addrspace(1)* %out
 
   ret void
 }
@@ -48,9 +48,9 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
 ; CHECK-LABEL: @rcp_fdiv_fpmath(
 ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
 ; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)
-; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
-; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x
-; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x
+; CHECK: %afn.no.md = call afn float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %afn.25ulp = call afn float @llvm.amdgcn.rcp.f32(float %x)
 ; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)
 ; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)
 define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
@@ -63,11 +63,11 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
   %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
   store volatile float %md.half.ulp, float addrspace(1)* %out
 
-  %arcp.no.md = fdiv arcp float 1.0, %x
-  store volatile float %arcp.no.md, float addrspace(1)* %out
+  %afn.no.md = fdiv afn float 1.0, %x
+  store volatile float %afn.no.md, float addrspace(1)* %out
 
-  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
-  store volatile float %arcp.25ulp, float addrspace(1)* %out
+  %afn.25ulp = fdiv afn float 1.0, %x, !fpmath !0
+  store volatile float %afn.25ulp, float addrspace(1)* %out
 
   %fast.no.md = fdiv fast float 1.0, %x
   store volatile float %fast.no.md, float addrspace(1)* %out
@@ -78,28 +78,6 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
   ret void
 }
 
-; CHECK-LABEL: @rcp_fdiv_arcp_denormal(
-; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0
-; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2
-; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
-; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
-define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 {
-
-  %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0
-  store volatile float %arcp.low.accuracy, float addrspace(1)* %out
-
-  %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2
-  store volatile float %arcp.high.accuracy, float addrspace(1)* %out
-
-  %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0
-  store volatile float %arcp.low.afn, float addrspace(1)* %out
-
-  %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2
-  store volatile float %arcp.high.afn, float addrspace(1)* %out
-
-  ret void
-}
-
 ; CHECK-LABEL: @fdiv_fpmath_vector(
 ; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -113,31 +91,31 @@ define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, floa
 
 ; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]]
 ; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
 ; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
 ; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]]
 ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
 ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2
+; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]]
 ; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0
 ; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
 ; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2
+; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]]
 ; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1
 ; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
+; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]])
 ; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
 ; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]])
 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
 define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
   %no.md = fdiv <2 x float> %a, %b
@@ -165,20 +143,20 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[HALF0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
-; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]]
 ; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
 ; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[HALF_FDIV1:[0-9]+]] =  fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] =  fdiv float 1.000000e+00, %[[HALF1]]
 ; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
 ; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
 
-; CHECK: %[[ARCP_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
-; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
-; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] =  fdiv arcp float 1.000000e+00, %[[ARCP_NO1]]
-; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
+; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0
+; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
+; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -188,13 +166,13 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
 
-; CHECK: %[[ARCP_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
-; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
-; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_25_RCP1:[0-9]+]] =  call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
+; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0
+; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_25_RCP1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_RCP1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -210,14 +188,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
   %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
   store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
 
-  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
-  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+  %afn.no.md = fdiv afn <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
 
   %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
   store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
 
-  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
-  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+  %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
 
   %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
   store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -234,13 +212,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
 ; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
-; CHECK: %[[ARCP_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
-; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
-; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] =  fdiv arcp float 2.000000e+00, %[[ARCP_NO1]]
-; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
+; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0
+; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
+; CHECK: %[[AFN_NO_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_NO_FDIV1]]
+; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -251,14 +230,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 ; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
 
-; CHECK: %[[ARCP_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
-; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
-; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_25_RCP1:[0-9]+]] =  call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
-; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]]
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
+; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0
+; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_25_RCP1:[0-9]+]] =  call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
+; CHECK: %[[AFN_25_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_25_RCP1]]
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
 ; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -272,14 +251,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
-  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
-  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+  %afn.no.md = fdiv afn <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
 
   %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
 
-  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
-  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+  %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
 
   %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
   store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -288,34 +267,34 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
 }
 
 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
-; CHECK: %[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
-; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]])
-; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0
-; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0
-; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
-; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
-; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]])
-; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp
+; CHECK: %[[AFN_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
+; CHECK: %[[AFN_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
+; CHECK: %[[AFN_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B0]])
+; CHECK: %[[AFN_MUL0:[0-9]+]] = fmul afn float %[[AFN_A0]], %[[AFN_RCP0]]
+; CHECK: %[[AFN_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_MUL0]], i64 0
+; CHECK: %[[AFN_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
+; CHECK: %[[AFN_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
+; CHECK: %[[AFN_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B1]])
+; CHECK: %[[AFN_MUL1:[0-9]+]] = fmul afn float %[[AFN_A1]], %[[AFN_RCP1]]
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_INS0]], float %[[AFN_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp
 
 ; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
 ; CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
 ; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]])
-; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0
+; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]]
 ; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0
 ; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
 ; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
 ; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]])
-; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0
+; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]]
 ; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.25ulp
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
 
-  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
-  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+  %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
 
   %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
   store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -325,13 +304,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
 
 ; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
 ; CHECK: %no.md = fdiv float %a, %b{{$}}
-; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
-; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
-; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
-; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
+; CHECK: %md.half.ulp = fdiv float %a, %b
+; CHECK: %md.1ulp = fdiv float %a, %b
+; CHECK: %md.25ulp = fdiv float %a, %b
+; CHECK: %md.3ulp = fdiv float %a, %b
 ; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0
-; CHECK: %arcp.md.25ulp  = fdiv arcp float %a, %b, !fpmath !0
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]]
+; CHECK: %[[RCP_AFN:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %afn.md.25ulp  = fmul afn float %a, %[[RCP_AFN]]
 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
@@ -351,8 +331,8 @@ define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, f
   %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
   store volatile float %fast.md.25ulp, float addrspace(1)* %out
 
-  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
-  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+  %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
+  store volatile float %afn.md.25ulp, float addrspace(1)* %out
 
   ret void
 }
@@ -361,11 +341,6 @@ attributes #0 = { nounwind optnone noinline }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind "target-features"="+fp32-denormals" }
 
-; CHECK: !0 = !{float 2.500000e+00}
-; CHECK: !1 = !{float 5.000000e-01}
-; CHECK: !2 = !{float 1.000000e+00}
-; CHECK: !3 = !{float 3.000000e+00}
-
 !0 = !{float 2.500000e+00}
 !1 = !{float 5.000000e-01}
 !2 = !{float 1.000000e+00}

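For reference, the IR checks above are exercised through opt; the file's RUN
line (outside these hunks) is along the lines of:

  ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
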
diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 3199c49af4e5..3f61aca713e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -63,7 +63,7 @@ entry:
   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half 1.0, %b.val
+  %r.val = fdiv half 1.0, %b.val, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
@@ -82,25 +82,46 @@ entry:
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
   %b.abs = call half @llvm.fabs.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.abs
+  %r.val = fdiv half 1.0, %b.abs, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
 
-; GCN-LABEL: {{^}}v_rcp_f16_arcp:
+; We cannot do 1/b -> rcp_f16(b) when !fpmath is absent or requires better
+; than 1.0 ulp.
+
+; GCN-LABEL: {{^}}reciprocal_f16_rounded:
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
+; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
+; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
+; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half 1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_afn:
 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
 ; GFX8_9_10-NOT: [[VAL]]
 ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GFX8_9_10-NOT: [[RESULT]]
 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half 1.0, %b.val
+  %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
@@ -118,7 +139,7 @@ entry:
   %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv half -1.0, %b.val
+  %r.val = fdiv half -1.0, %b.val, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
@@ -137,7 +158,7 @@ entry:
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
   %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.sqrt
+  %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
@@ -157,12 +178,12 @@ entry:
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %b.val = load volatile half, half addrspace(1)* %gep.b
   %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half -1.0, %b.sqrt
+  %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; GCN-LABEL: {{^}}v_fdiv_f16_afn:
 ; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
 ; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
 
@@ -170,7 +191,7 @@ entry:
 ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
 
 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -179,7 +200,7 @@ entry:
   %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
   %a.val = load volatile half, half addrspace(1)* %gep.a
   %b.val = load volatile half, half addrspace(1)* %gep.b
-  %r.val = fdiv arcp half %a.val, %b.val
+  %r.val = fdiv afn half %a.val, %b.val
   store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
@@ -206,38 +227,38 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; FUNC-LABEL: {{^}}div_afn_2_x_pat_f16:
 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 
 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
 ; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 2.0
+  %rcp = fdiv afn half %x, 2.0
   store half %rcp, half addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}
 
 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
 ; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, 10.0
+  %rcp = fdiv afn half %x, 10.0
   store half %rcp, half addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}
 
 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
 ; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
   %x = load half, half addrspace(1)* undef
-  %rcp = fdiv arcp half %x, -10.0
+  %rcp = fdiv afn half %x, -10.0
   store half %rcp, half addrspace(1)* %out, align 4
   ret void
 }
@@ -249,3 +270,5 @@ declare half @llvm.fabs.f16(half) #1
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "unsafe-fp-math"="true" }
+
+!0 = !{float 2.500000e+00}
