[llvm] 2531535 - AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare

Changpeng Fang via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 23 16:58:15 PST 2020


Author: Changpeng Fang
Date: 2020-01-23T16:57:43-08:00
New Revision: 2531535984ad989ce88aeee23cb92a827da6686e

URL: https://github.com/llvm/llvm-project/commit/2531535984ad989ce88aeee23cb92a827da6686e
DIFF: https://github.com/llvm/llvm-project/commit/2531535984ad989ce88aeee23cb92a827da6686e.diff

LOG: AMDGPU: Implement FDIV optimizations in AMDGPUCodeGenPrepare

    Summary:
      RCP has an accuracy limit. If the FDIV !fpmath metadata requires high
    accuracy, rcp may not meet the requirement. However, in DAG lowering the
    fpmath information gets lost, and thus we may generate either inaccurate
    rcp-related computation or slow code for fdiv.

    This patch implements the fdiv optimizations in AMDGPUCodeGenPrepare,
    which still has exact knowledge of !fpmath.

     FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
                         unsafe-fp-math, fast math flags, denormals and fpmath
                         accuracy request.

     RCP Optimizations:
       1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
                                                      denormals flushed.
       a/b -> a*rcp(b) when fast unsafe rcp is legal.

     Use fdiv.fast:
       a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals flushed.

       1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
                              fpmath >= 2.5ULP with denormals enabled.
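
     For illustration, the intended IR-level rewrites look like this
     (a sketch in the spirit of the updated tests; the surrounding kernel
     is omitted and !0 is the 2.5 ULP accuracy request):

       ; 1/x with fpmath >= 2.5ULP and denormals flushed
       %r = fdiv float 1.0, %x, !fpmath !0
       ;   becomes
       %r = call float @llvm.amdgcn.rcp.f32(float %x)

       ; a/b when fast unsafe rcp is legal (e.g. under 'fast' flags)
       %r = fdiv fast float %a, %b
       ;   becomes
       %t = call fast float @llvm.amdgcn.rcp.f32(float %b)
       %r = fmul fast float %a, %t

       !0 = !{float 2.500000e+00}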

    Reviewers:
      arsenm

    Differential Revision:
      https://reviews.llvm.org/D71293

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
    llvm/test/CodeGen/AMDGPU/fdiv.ll
    llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
    llvm/test/CodeGen/AMDGPU/fneg-combines.ll
    llvm/test/CodeGen/AMDGPU/known-never-snan.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
    llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
    llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
    llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
    llvm/test/CodeGen/AMDGPU/rsq.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 3ac634b6a47e..76f8d5e8c320 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -606,12 +606,64 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
   return true;
 }
 
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
+// Perform RCP optimizations:
+//
+// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//                                                denormals flushed.
+//
+// a/b -> a*rcp(b) when fast unsafe rcp is legal.
+static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
+                            IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
+                            bool HasDenormals, bool NeedHighAccuracy) {
+
+  Type *Ty = Den->getType();
+  if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
+                             (HasDenormals || NeedHighAccuracy))
+    return nullptr;
+
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
+  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+    if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+      if (CLHS->isExactlyValue(1.0)) {
+        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+        // the CI documentation has a worst case error of 1 ulp.
+        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+        // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
+        //       insert the rsq intrinsic here.
+
+        // 1.0 / x -> rcp(x)
+        return Builder.CreateCall(Decl, { Den });
+       }
+
+       // Same as for 1.0, but expand the sign out of the constant.
+       if (CLHS->isExactlyValue(-1.0)) {
+         // -1.0 / x -> rcp (fneg x)
+         Value *FNeg = Builder.CreateFNeg(Den);
+         return Builder.CreateCall(Decl, { FNeg });
+       }
+    }
+  }
+
+  if (FastUnsafeRcpLegal) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    Value *Recip = Builder.CreateCall(Decl, { Den });
+    return Builder.CreateFMul(Num, Recip, "", FPMath);
+  }
+  return nullptr;
+}
+
+static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
+                              bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
     return HasDenormals;
 
-  if (UnsafeDiv)
+  if (FastUnsafeRcpLegal)
     return true;
 
   bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
@@ -620,44 +672,57 @@ static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   return HasDenormals ^ IsOne;
 }
 
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+
+// Optimization is performed based on fpmath, fast math flags, as well as
+// denormals, to lower fdiv using either rcp or fdiv.fast.
+//
+// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
+//                     unsafe-fp-math, fast math flags, denormals and fpmath
+//                     accuracy request.
+//
+// RCP Optimizations:
+//   1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
+//                                                  denormals flushed.
+//   a/b -> a*rcp(b) when fast unsafe rcp is legal.
+//
+// Use fdiv.fast:
+//   a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
+//                          fpmath >= 2.5ULP with denormals flushed.
+//
+//   1/x -> fdiv.fast(1,x)  when RCP optimization is not performed and
+//                          fpmath >= 2.5ULP with denormals enabled.
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
-  Type *Ty = FDiv.getType();
 
-  if (!Ty->getScalarType()->isFloatTy())
-    return false;
+  Type *Ty = FDiv.getType()->getScalarType();
 
-  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
-  if (!FPMath)
+  // No intrinsic for fdiv16 if the target does not support f16.
+  if (Ty->isHalfTy() && !ST->has16BitInsts())
     return false;
 
   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
-  float ULP = FPOp->getFPAccuracy();
-  if (ULP < 2.5f)
-    return false;
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
 
   FastMathFlags FMF = FPOp->getFastMathFlags();
-  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
-                                      FMF.allowReciprocal();
+  // Determine whether it is ok to use rcp based on unsafe-fp-math,
+  // fast math flags, denormals and accuracy request.
+  const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
+          (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
+                                     || FMF.approxFunc()));
 
-  // With UnsafeDiv node will be optimized to just rcp and mul.
-  if (UnsafeDiv)
-    return false;
+  // Use fdiv.fast only for f32, when fpmath >= 2.5ULP and rcp is not used.
+  const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
+                           !FastUnsafeRcpLegal;
 
-  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
 
-  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
 
   Value *NewFDiv = nullptr;
-
-  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+  if (VectorType *VT = dyn_cast<VectorType>(FDiv.getType())) {
     NewFDiv = UndefValue::get(VT);
 
     // FIXME: Doesn't do the right thing for cases where the vector is partially
@@ -665,19 +730,32 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
       Value *NumEltI = Builder.CreateExtractElement(Num, I);
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
-      Value *NewElt;
-
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
-        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
-      } else {
-        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      Value *NewElt = nullptr;
+      if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
+                                           HasFP32Denormals)) {
+        Function *Decl =
+                 Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
       }
+      if (!NewElt) // Try rcp.
+        NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
+                               FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
+      if (!NewElt)
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
 
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
-  } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
-      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  } else { // Scalar.
+    if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
+                                          HasFP32Denormals)) {
+      Function *Decl =
+               Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
+    }
+    if (!NewFDiv) { // Try rcp.
+      NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
+                              Mod, HasFP32Denormals, NeedHighAccuracy);
+    }
   }
 
   if (NewFDiv) {
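
A minimal standalone reproducer for the new pass behavior, modeled on the
updated amdgpu-codegenprepare-fdiv.ll checks (the exact RUN-line triple of
the original test may differ, and the kernel name here is made up):

  ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s
  define amdgpu_kernel void @rcp_25ulp(float addrspace(1)* %out, float %x) {
    %div = fdiv float 1.0, %x, !fpmath !0
    store float %div, float addrspace(1)* %out
    ret void
  }
  !0 = !{float 2.500000e+00}   ; allow up to 2.5 ULP of error

With FP32 denormals flushed, %div is rewritten into a call to
@llvm.amdgcn.rcp.f32; with denormals enabled it becomes a call to
@llvm.amdgcn.fdiv.fast instead.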

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2f7f75530bb6..84f26cccd7d0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7474,49 +7474,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   SDValue RHS = Op.getOperand(1);
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
-  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
 
-  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
+  bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
+         (Flags.hasAllowReciprocal() &&
+          ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
+            VT == MVT::f16 ||
+            Flags.hasApproximateFuncs()));
+
+  // Do the rcp optimization only when fast unsafe rcp is legal here.
+  // NOTE: We already performed the RCP optimization to insert intrinsics in
+  // AMDGPUCodeGenPrepare. Ideally there should be no remaining opportunity
+  // for rcp optimization here.
+  //   However, there are cases like FREM, which is expanded into a sequence
+  // of instructions including FDIV, and which may expose new opportunities.
+  if (!FastUnsafeRcpLegal)
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
-    if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
-      if (CLHS->isExactlyValue(1.0)) {
-        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
-        // the CI documentation has a worst case error of 1 ulp.
-        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
-        // use it as long as we aren't trying to use denormals.
-        //
-        // v_rcp_f16 and v_rsq_f16 DO support denormals.
-
-        // 1.0 / sqrt(x) -> rsq(x)
-
-        // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
-        // error seems really high at 2^29 ULP.
-        if (RHS.getOpcode() == ISD::FSQRT)
-          return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
-        // 1.0 / x -> rcp(x)
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-      }
+    if (CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+      // the CI documentation has a worst case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+      //
+      // v_rcp_f16 and v_rsq_f16 DO support denormals.
 
-      // Same as for 1.0, but expand the sign out of the constant.
-      if (CLHS->isExactlyValue(-1.0)) {
-        // -1.0 / x -> rcp (fneg x)
-        SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-        return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
-      }
+      // 1.0 / sqrt(x) -> rsq(x)
+
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
     }
-  }
 
-  if (Unsafe) {
-    // Turn into multiply by the reciprocal.
-    // x / y -> x * (1.0 / y)
-    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
+    // Same as for 1.0, but expand the sign out of the constant.
+    if (CLHS->isExactlyValue(-1.0)) {
+      // -1.0 / x -> rcp (fneg x)
+      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+    }
   }
 
-  return SDValue();
+  // Turn into multiply by the reciprocal.
+  // x / y -> x * (1.0 / y)
+  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
 }
 
 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
@@ -8663,6 +8668,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
                            N->getFlags());
   }
 
+  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
+                           N0.getOperand(0), N->getFlags());
+  }
+
   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
 }
 
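In IR terms, the new performRcpCombine case catches a reciprocal fed by a
square root even after AMDGPUCodeGenPrepare has already produced the rcp
intrinsic (a conceptual sketch; the combine itself runs on DAG nodes):

  %sqrt = call float @llvm.sqrt.f32(float %x)
  %rcp  = call float @llvm.amdgcn.rcp.f32(float %sqrt)
  ; rcp(sqrt(x)) is selected as a single v_rsq instruction for f32/f16

This is what the updated safe_rsq_rcp_pat_f32 check in llvm.amdgcn.rcp.ll
verifies below.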

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 0c7160df2b96..6ead8b9ad13e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -16,8 +16,10 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
-; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
-; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0
+; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0
 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
@@ -45,12 +47,12 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
 
 ; CHECK-LABEL: @rcp_fdiv_fpmath(
 ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
-; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
+; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)
 ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
-; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
-; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
-; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
-; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
+; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x
+; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)
 define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
   %no.md = fdiv float 1.0, %x
   store volatile float %no.md, float addrspace(1)* %out
@@ -76,10 +78,58 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
   ret void
 }
 
+; CHECK-LABEL: @rcp_fdiv_arcp_denormal(
+; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0
+; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2
+; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
+define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 {
+
+  %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0
+  store volatile float %arcp.low.accuracy, float addrspace(1)* %out
+
+  %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2
+  store volatile float %arcp.high.accuracy, float addrspace(1)* %out
+
+  %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0
+  store volatile float %arcp.low.afn, float addrspace(1)* %out
+
+  %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2
+  store volatile float %arcp.high.afn, float addrspace(1)* %out
+
+  ret void
+}
+
 ; CHECK-LABEL: @fdiv_fpmath_vector(
-; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
-; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
-; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float %[[NO_A0]], %[[NO_B0]]
+; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0
+; CHECK: %[[NO_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[NO_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float %[[NO_A1]], %[[NO_B1]]
+; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1
+; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
+; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1
+; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2
+; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0
+; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2
+; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
 
 ; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
 ; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -106,12 +156,52 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
 }
 
 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
-; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
-; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
-; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
-; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0
+; CHECK: %[[NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]]
+; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0
+; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[NO1]]
+; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[HALF0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1
+; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
+; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] =  fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1
+; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[ARCP_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
+; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
+; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] =  fdiv arcp float 1.000000e+00, %[[ARCP_NO1]]
+; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
+; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0
+; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[FAST_NO_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
+; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1
+; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[ARCP_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
+; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
+; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[ARCP_25_RCP1:[0-9]+]] =  call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
+; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1
+; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
+; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0
+; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[FAST_25_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]])
+; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_RCP1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
@@ -136,12 +226,48 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
 }
 
 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
-; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
-; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
-; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
-; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
-; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0
-; CHECK: store volatile <2 x float> %fast.25ulp
+; CHECK: %[[NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[NO_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[NO0]]
+; CHECK: %[[NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[NO_FDIV0]], i64 0
+; CHECK: %[[NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[NO_FDIV1:[0-9]+]] = fdiv float 2.000000e+00, %[[NO1]]
+; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[ARCP_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
+; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
+; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] =  fdiv arcp float 2.000000e+00, %[[ARCP_NO1]]
+; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[FAST_NO0:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
+; CHECK: %[[FAST_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_NO_RCP0]], i64 0
+; CHECK: %[[FAST_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[FAST_NO_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO1]])
+; CHECK: %[[FAST_NO_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_NO_RCP1]]
+; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[ARCP_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
+; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
+; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[ARCP_25_RCP1:[0-9]+]] =  call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
+; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]]
+; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+; CHECK: %[[FAST_250:[0-9]+]] =  extractelement <2 x float> %x, i64 0
+; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
+; CHECK: %[[FAST_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_25_RCP0]], i64 0
+; CHECK: %[[FAST_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[FAST_25_RCP1:[0-9]+]] =  call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_251]])
+; CHECK: %[[FAST_25_MUL1:[0-9]+]] = fmul fast float 2.000000e+00, %[[FAST_25_RCP1]]
+; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_25_INS0]], float %[[FAST_25_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@@ -161,12 +287,29 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
   ret void
 }
 
-; FIXME: Should be able to get fdiv for 1.0 component
 ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
+; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
+; CHECK: %[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
+; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]])
+; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0
+; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0
+; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
+; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
+; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]])
+; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0
+; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1
 ; CHECK: store volatile <2 x float> %arcp.25ulp
 
-; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
+; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
+; CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
+; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]])
+; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0
+; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0
+; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
+; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
+; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]])
+; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0
+; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1
 ; CHECK: store volatile <2 x float> %fast.25ulp
 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
@@ -186,8 +329,9 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
 ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
-; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
-; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0
+; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
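
Throughout these tests, !fpmath carries the maximum acceptable error of the
annotated operation as a single float operand, measured in ULPs:

  %div = fdiv float %a, %b, !fpmath !0   ; up to 2.5 ULP of error allowed
  !0 = !{float 2.500000e+00}

An fdiv without !fpmath is treated as requiring high accuracy, which is why
several tests below gain an explicit !fpmath !0 to keep exercising the rcp
and fdiv.fast paths.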

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index bd4deb14aade..74ad62159053 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -284,6 +284,68 @@ define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out,
   ret void
 }
 
+; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX10: s_denorm_mode 15
+; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX10: s_denorm_mode 12
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+
+define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+
+; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:
+
+; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
+; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
+
+; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; PREGFX10-NOT: s_setreg
+; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
+; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
+; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
+; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; PREGFX10-NOT: s_setreg
+
+; GFX10-NOT: s_denorm_mode
+; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
+; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
+; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
+; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
+; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
+; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
+; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
+; GFX10-NOT: s_denorm_mode
+
+; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
+; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
+entry:
+  %fdiv = fdiv float 1.000000e+00, %a
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
+
 attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" }
 attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
 attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
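
The two kernels added above check the opposite direction: with no !fpmath
and no fast-math flags, correct rounding is required, so the fdiv is left
alone and lowered to the full v_div_scale / v_fma / v_div_fmas /
v_div_fixup sequence:

  ; no !fpmath: must be correctly rounded, survives AMDGPUCodeGenPrepare
  %fdiv = fdiv float 1.000000e+00, %a

The s_setreg (s_denorm_mode on GFX10) toggles appear only when FP32
denormals are otherwise flushed and must be enabled around the core of the
expansion.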

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index 01499e681eaf..c02a21efce5f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -348,7 +348,7 @@ define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num
 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
 define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
-  %div = fdiv fast float 1.000000e+00, %load
+  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
   store float %div, float addrspace(1)* %arg, align 4
   ret void
 }
@@ -359,7 +359,7 @@ define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
 define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
-  %div = fdiv fast float -1.000000e+00, %load
+  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
   store float %div, float addrspace(1)* %arg, align 4
   ret void
 }
@@ -370,7 +370,7 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
 define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
-  %neg = fsub float -0.000000e+00, %load
+  %neg = fsub float -0.000000e+00, %load, !fpmath !0
   %div = fdiv fast float 1.000000e+00, %neg
   store float %div, float addrspace(1)* %arg, align 4
   ret void
@@ -382,22 +382,18 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
 ; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
 define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
-  %neg = fsub float -0.000000e+00, %load
+  %neg = fsub float -0.000000e+00, %load, !fpmath !0
   %div = fdiv fast float -1.000000e+00, %neg
   store float %div, float addrspace(1)* %arg, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM:     v_div_fmas_f32
-; GCN-DENORM:     v_div_fixup_f32
-
-; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
-; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN-DAG: v_div_scale_f32
+; GCN-DAG: v_rcp_f32_e32
+; GCN-DAG: v_div_scale_f32
+; GCN:     v_div_fmas_f32
+; GCN:     v_div_fixup_f32
 define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv float 1.000000e+00, %load
@@ -406,15 +402,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg
 }
 
 ; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM:     v_div_fmas_f32
-; GCN-DENORM:     v_div_fixup_f32
-
-; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
-; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN-DAG: v_div_scale_f32
+; GCN-DAG: v_rcp_f32_e32
+; GCN-DAG: v_div_scale_f32
+; GCN:     v_div_fmas_f32
+; GCN:     v_div_fixup_f32
 define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %div = fdiv float -1.000000e+00, %load
@@ -423,15 +415,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM:     v_div_fmas_f32
-; GCN-DENORM:     v_div_fixup_f32
-
-; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
-; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN-DAG: v_div_scale_f32
+; GCN-DAG: v_rcp_f32_e32
+; GCN-DAG: v_div_scale_f32
+; GCN:     v_div_fmas_f32
+; GCN:     v_div_fixup_f32
 define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load
@@ -441,15 +429,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)
 }
 
 ; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM-DAG: v_rcp_f32_e32
-; GCN-DENORM-DAG: v_div_scale_f32
-; GCN-DENORM:     v_div_fmas_f32
-; GCN-DENORM:     v_div_fixup_f32
-
-; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
-; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN-DAG: v_div_scale_f32
+; GCN-DAG: v_rcp_f32_e32
+; GCN-DAG: v_div_scale_f32
+; GCN:     v_div_fmas_f32
+; GCN:     v_div_fixup_f32
 define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
   %load = load float, float addrspace(1)* %arg, align 4
   %neg = fsub float -0.000000e+00, %load

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index 5dda92dbd5ec..133afd4d32cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -219,13 +219,30 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %
 ; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
 ; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
 ; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
+define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
+.entry:
+  %tmp7 = fdiv float 1.000000e+00, %tmp6
+  %tmp8 = fmul float 0.000000e+00, %tmp7
+  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
+  %.i188 = fadd float %tmp9, 0.000000e+00
+  %tmp10 = fcmp uge float %.i188, %tmp2
+  %tmp11 = fsub float -0.000000e+00, %.i188
+  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
+  %tmp12 = fcmp ule float %.i092, 0.000000e+00
+  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
+  ret float %.i198
+}
+
+; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
+; the unsafe-fp-math function attribute automatically. Combine this with the
+; previous test once that is done.
+; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
 ; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
 ; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
 ; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
 ; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
 ; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
-
-define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
+define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
 .entry:
   %tmp7 = fdiv float 1.000000e+00, %tmp6
   %tmp8 = fmul float 0.000000e+00, %tmp7
@@ -2524,3 +2541,4 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
index cf3b3ccbc07f..25f110f9126a 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -11,7 +11,7 @@ define float @v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, |v0|, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.fabs.f32(float %a.nnan.add)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
@@ -22,10 +22,10 @@ define float @v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_med3_f32 v0, -v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = fsub float -0.0, %a.nnan.add
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
@@ -71,7 +71,7 @@ define float @v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32(float %a, flo
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.copysign.f32(float %a.nnan.add, float %sign)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
@@ -101,7 +101,7 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b.nnan.add)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -166,7 +166,7 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
@@ -182,7 +182,7 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b.nnan.add)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -215,7 +215,7 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
   %med = call float @llvm.minnum.f32(float %max, float 4.0)
@@ -232,7 +232,7 @@ define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
   %cmp = icmp eq i32 %c, 0
   %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b.nnan.add
@@ -269,7 +269,7 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %a.nnan.add = fdiv nnan float 1.0, %a
+  %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %cmp = icmp eq i32 %c, 0
   %known.not.snan = select i1 %cmp, float %a.nnan.add, float %b
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -669,3 +669,5 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+
+!0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index ad2d84b7911b..a3c08038b87b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -37,7 +37,7 @@ define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 {
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
-  %rcp = fdiv float 1.0, %src
+  %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -47,7 +47,7 @@ define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %o
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
-  %rcp = fdiv float 1.0, %src
+  %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -61,8 +61,7 @@ define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
-; SI: v_sqrt_f32_e32
-; SI: v_rcp_f32_e32
+; SI: v_rsq_f32_e32
 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
   %sqrt = call float @llvm.sqrt.f32(float %src)
   %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
@@ -144,3 +143,5 @@ attributes #1 = { nounwind "unsafe-fp-math"="false" "target-features"="-fp32-den
 attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
 attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
 attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" }
+
+!0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index d6b717411de7..9f717df480fb 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -112,7 +112,7 @@ bb:
 
 bb19:                                             ; preds = %bb
   %tmp20 = uitofp i32 %arg6 to float
-  %tmp21 = fdiv float 1.000000e+00, %tmp20
+  %tmp21 = fdiv float 1.000000e+00, %tmp20, !fpmath !0
   %tmp22 = and i32 %arg6, 16777215
   br label %bb23
 
@@ -258,3 +258,5 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
 
 attributes #0 = { nounwind willreturn }
 attributes #1 = { nounwind readnone speculatable }
+
+!0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index fbdaeb829297..b7552b06f089 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -10,7 +10,7 @@
 
 ; EG: RECIP_IEEE
 define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
-  %rcp = fdiv float 1.0, %src
+  %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -71,7 +71,7 @@ define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %ou
 ; EG: RECIP_IEEE
 define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
-  %rcp = fdiv float 1.0, %src.fabs
+  %rcp = fdiv float 1.0, %src.fabs, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -83,7 +83,7 @@ define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src
 
 ; EG: RECIP_IEEE
 define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
-  %rcp = fdiv float -1.0, %src
+  %rcp = fdiv float -1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -95,7 +95,7 @@ define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src)
 define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
-  %rcp = fdiv float 1.0, %src.fabs.fneg
+  %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
@@ -109,7 +109,7 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float
 define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
-  %rcp = fdiv float 1.0, %src.fabs.fneg
+  %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0
   store volatile float %rcp, float addrspace(1)* %out, align 4
 
   %other = fmul float %src, %src.fabs.fneg

diff  --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
index 6fb680e62987..badaae3af237 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll
@@ -5,7 +5,7 @@
 define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %cvt = uitofp i32 %load to float
-  %div = fdiv float 1.000000e+00, %cvt
+  %div = fdiv float 1.000000e+00, %cvt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
@@ -15,7 +15,9 @@ define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %
 define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %cvt = sitofp i32 %load to float
-  %div = fdiv float 1.000000e+00, %cvt
+  %div = fdiv float 1.000000e+00, %cvt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
+
+!0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll
index 40c3c94246e3..8480f344601b 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.ll
@@ -11,7 +11,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone
 define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
-  %div = fdiv float 1.0, %sqrt
+  %div = fdiv float 1.0, %sqrt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
@@ -33,7 +33,7 @@ define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double add
 ; SI: s_endpgm
 define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
-  %div = fdiv float 1.0, %sqrt
+  %div = fdiv float 1.0, %sqrt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
@@ -41,15 +41,17 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
 ; Recognize that this is rsqrt(a) * rcp(b) * c,
 ; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
 
+; NOTE: c * rcp(sqrt(a) * b) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
+
 ; SI-LABEL: @rsqrt_fmul
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
 
-; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
-; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
-; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
-; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-UNSAFE-DAG: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], [[A]]
+; SI-UNSAFE-DAG: v_mul_f32_e32  [[MUL:v[0-9]+]], [[SQRT]], [[B]]
+; SI-UNSAFE-DAG: v_rcp_f32_e32  [[RCP:v[0-9]+]], [[MUL]]
+; SI-UNSAFE-DAG: v_mul_f32_e32  [[RESULT:v[0-9]+]], [[C]], [[RCP]]
 ; SI-UNSAFE: buffer_store_dword [[RESULT]]
 
 ; SI-SAFE-NOT: v_rsq_f32
@@ -78,13 +80,13 @@ define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(
 ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]]
 ; SI-SAFE: buffer_store_dword [[RSQ]]
 
-; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}}
-; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
-; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
+; SI-UNSAFE: v_sqrt_f32_e32 [[SQRT:v[0-9]+]], v{{[0-9]+}}
+; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]]
+; SI-UNSAFE: buffer_store_dword [[RSQ]]
 define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val)
-  %div = fdiv float -1.0, %sqrt
+  %div = fdiv float -1.0, %sqrt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
@@ -109,14 +111,14 @@ define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double
 ; SI-SAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]]
 ; SI-SAFE: buffer_store_dword [[RSQ]]
 
-; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}}
-; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
-; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
+; SI-UNSAFE: v_sqrt_f32_e64 [[SQRT:v[0-9]+]], -v{{[0-9]+}}
+; SI-UNSAFE: v_rcp_f32_e64 [[RSQ:v[0-9]+]], -[[SQRT]]
+; SI-UNSAFE: buffer_store_dword [[RSQ]]
 define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %val.fneg = fsub float -0.0, %val
   %sqrt = call float @llvm.sqrt.f32(float %val.fneg)
-  %div = fdiv float -1.0, %sqrt
+  %div = fdiv float -1.0, %sqrt, !fpmath !0
   store float %div, float addrspace(1)* %out, align 4
   ret void
 }
@@ -136,3 +138,5 @@ define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, do
   store double %div, double addrspace(1)* %out, align 4
   ret void
 }
+
+!0 = !{float 2.500000e+00}
