[llvm] ab3a3f5 - AMDGPU/GlobalISel: Update fdiv lowering for denormal/ulp interaction

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 6 09:32:33 PST 2021


Author: Matt Arsenault
Date: 2021-01-06T12:32:01-05:00
New Revision: ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65

URL: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65
DIFF: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65.diff

LOG: AMDGPU/GlobalISel: Update fdiv lowering for denormal/ulp interaction

Change the GlobalISel fast fdiv handling to match the changes in
2531535984ad989ce88aeee23cb92a827da6686e and
884acbb9e167d5668e43581630239d688edec8ad

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8c733a2afa032..a8e6f27e032bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3053,22 +3053,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
   Register Res = MI.getOperand(0).getReg();
   Register LHS = MI.getOperand(1).getReg();
   Register RHS = MI.getOperand(2).getReg();
-
   uint16_t Flags = MI.getFlags();
-
   LLT ResTy = MRI.getType(Res);
-  LLT S32 = LLT::scalar(32);
-  LLT S64 = LLT::scalar(64);
 
   const MachineFunction &MF = B.getMF();
-  bool Unsafe =
-    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+                            MI.getFlag(MachineInstr::FmAfn);
 
-  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
-    return false;
-
-  if (!Unsafe && ResTy == S32 &&
-      MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
+  if (!AllowInaccurateRcp)
     return false;
 
   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -3095,17 +3087,13 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
   }
 
   // x / y -> x * (1.0 / y)
-  if (Unsafe) {
-    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
-      .addUse(RHS)
-      .setMIFlags(Flags);
-    B.buildFMul(Res, LHS, RCP, Flags);
-
-    MI.eraseFromParent();
-    return true;
-  }
+  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+    .addUse(RHS)
+    .setMIFlags(Flags);
+  B.buildFMul(Res, LHS, RCP, Flags);
 
-  return false;
+  MI.eraseFromParent();
+  return true;
 }
 
 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 7775789bd0d2f..c7b9b4f60bc65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -66,7 +66,28 @@ define half @v_fdiv_f16(half %a, half %b) {
 }
 
 define half @v_fdiv_f16_afn(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn:
+; GFX6-LABEL: v_fdiv_f16_afn:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn half %a, %b
+  ret half %fdiv
+}
+
+define half @v_fdiv_f16_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -85,7 +106,7 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -107,25 +128,29 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89-LABEL: v_fdiv_f16_ulp25:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn half %a, %b
+  %fdiv = fdiv half %a, %b, !fpmath !0
   ret half %fdiv
 }
 
-define half @v_fdiv_f16_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
+define half @v_rcp_f16(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
 ; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
@@ -133,18 +158,18 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX6-FLUSH-LABEL: v_rcp_f16:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
@@ -154,27 +179,27 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX89-LABEL: v_fdiv_f16_ulp25:
+; GFX89-LABEL: v_rcp_f16:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX89-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv half %a, %b, !fpmath !0
+  %fdiv = fdiv half 1.0, %x
   ret half %fdiv
 }
 
-define half @v_rcp_f16(half %x) {
-; GFX6-IEEE-LABEL: v_rcp_f16:
+define half @v_rcp_f16_arcp(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
@@ -193,7 +218,7 @@ define half @v_rcp_f16(half %x) {
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-FLUSH-LABEL: v_rcp_f16:
+; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
@@ -215,30 +240,15 @@ define half @v_rcp_f16(half %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX89-LABEL: v_rcp_f16:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv half 1.0, %x
-  ret half %fdiv
-}
-
-define half @v_rcp_f16_arcp(half %x) {
-; GFX6-LABEL: v_rcp_f16_arcp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX89-LABEL: v_rcp_f16_arcp:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX89-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp half 1.0, %x
   ret half %fdiv
@@ -316,7 +326,28 @@ define half @v_rcp_f16_ulp25(half %x) {
 }
 
 define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn half %a, %b, !fpmath !0
+  ret half %fdiv
+}
+
+define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -335,7 +366,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -357,32 +388,15 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
-; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX89-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn half %a, %b, !fpmath !0
-  ret half %fdiv
-}
-
-define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
-; GFX6-LABEL: v_fdiv_f16_arcp_ulp25:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX89:       ; %bb.0:
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX89-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp half %a, %b, !fpmath !0
   ret half %fdiv
@@ -508,76 +522,20 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 }
 
 define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v8, -v4, v7, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
-; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_fdiv_v2f16_afn:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_afn:
 ; GFX8:       ; %bb.0:
@@ -799,8 +757,18 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX8-LABEL: v_rcp_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -809,35 +777,113 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
 }
 
 define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
-; GFX6-LABEL: v_rcp_v2f16_arcp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_arcp:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 16
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -846,10 +892,21 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX9-LABEL: v_rcp_v2f16_arcp:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
@@ -987,7 +1044,49 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 }
 
 define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v1, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+  ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1020,7 +1119,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1058,69 +1157,48 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v1, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
-  ret <2 x half> %fdiv
-}
-
-define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-LABEL: v_fdiv_v2f16_arcp_ulp25:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v1, 16
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v5
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v5
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v4, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
   ret <2 x half> %fdiv

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 02114a058c891..a29c96b93f56f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -157,11 +157,23 @@ define float @v_rcp_f32(float %x) {
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-FLUSH-LABEL: v_rcp_f32:
-; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_f32:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_f32:
 ; GFX89-IEEE:       ; %bb.0:
@@ -178,16 +190,96 @@ define float @v_rcp_f32(float %x) {
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32:
+; GFX89-FLUSH:       ; %bb.0:
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float 1.0, %x
   ret float %fdiv
 }
 
 define float @v_rcp_f32_arcp(float %x) {
-; GCN-LABEL: v_rcp_f32_arcp:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX89-IEEE:       ; %bb.0:
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX89-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX89-FLUSH:       ; %bb.0:
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float 1.0, %x
   ret float %fdiv
 }
@@ -237,12 +329,21 @@ define float @v_fdiv_f32_afn_ulp25(float %a, float %b) {
 }
 
 define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
-; GCN-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
-; GCN-IEEE:       ; %bb.0:
-; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
@@ -256,6 +357,22 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX89-IEEE:       ; %bb.0:
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float %a, %b, !fpmath !0
   ret float %fdiv
 }
@@ -500,12 +617,36 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-FLUSH-LABEL: v_rcp_v2f32:
-; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_v2f32:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_v2f32:
 ; GFX89-IEEE:       ; %bb.0:
@@ -534,17 +675,158 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32:
+; GFX89-FLUSH:       ; %bb.0:
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
   ret <2 x float> %fdiv
 }
 
 define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
-; GCN-LABEL: v_rcp_v2f32_arcp:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v5, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX89-IEEE:       ; %bb.0:
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v6, v2
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v7, v3
+; GFX89-IEEE-NEXT:    v_fma_f32 v8, -v2, v6, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v9, -v3, v7, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v6, v8, v6, v6
+; GFX89-IEEE-NEXT:    v_fma_f32 v7, v9, v7, v7
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v2, v8, v4
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v3, v9, v5
+; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v6, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v8, v4
+; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v7, v9
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v6, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v3, -v3, v9, v5
+; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX89-FLUSH:       ; %bb.0:
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
   ret <2 x float> %fdiv
 }
@@ -604,14 +886,32 @@ define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) {
 }
 
 define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
-; GCN-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
-; GCN-IEEE:       ; %bb.0:
-; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v5, v6, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v5, v6, v2
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
@@ -631,6 +931,34 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v5, v0
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v1
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX89-IEEE:       ; %bb.0:
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v7, s[4:5], v1, v3, v1
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v8, v4
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v9, v5
+; GFX89-IEEE-NEXT:    v_fma_f32 v10, -v4, v8, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, -v5, v9, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v9, v11, v9, v9
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v10, v6, v8
+; GFX89-IEEE-NEXT:    v_fma_f32 v12, -v4, v10, v6
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT:    v_fma_f32 v13, -v5, v11, v7
+; GFX89-IEEE-NEXT:    v_fma_f32 v10, v12, v8, v10
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v4, v10, v6
+; GFX89-IEEE-NEXT:    v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v5, v11, v7
+; GFX89-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
+; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 3b9e56b1a7425..b7546223f7d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -410,21 +410,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
+; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; CI-NEXT:    v_mov_b32_e32 v2, s4
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -438,21 +429,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mul_f64 v[0:1], s[0:1], v[0:1]
+; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
index 9bef474d08e39..4554bc81ef937 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -308,21 +308,60 @@ body: |
     ; SI-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; SI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
-    ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
-    ; SI: $vgpr0 = COPY [[FMUL]](s32)
+    ; SI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; SI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+    ; SI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+    ; SI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; SI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+    ; SI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+    ; SI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+    ; SI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+    ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+    ; SI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+    ; SI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+    ; SI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+    ; SI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+    ; SI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+    ; SI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+    ; SI: $vgpr0 = COPY [[INT6]](s32)
     ; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; VI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
-    ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
-    ; VI: $vgpr0 = COPY [[FMUL]](s32)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; VI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+    ; VI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+    ; VI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; VI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+    ; VI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+    ; VI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+    ; VI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+    ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+    ; VI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+    ; VI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+    ; VI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+    ; VI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+    ; VI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+    ; VI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+    ; VI: $vgpr0 = COPY [[INT6]](s32)
     ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX9: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
-    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
-    ; GFX9: $vgpr0 = COPY [[FMUL]](s32)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; GFX9: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+    ; GFX9: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+    ; GFX9: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; GFX9: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+    ; GFX9: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+    ; GFX9: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+    ; GFX9: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+    ; GFX9: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+    ; GFX9: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+    ; GFX9: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+    ; GFX9: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+    ; GFX9: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+    ; GFX9: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+    ; GFX9: $vgpr0 = COPY [[INT6]](s32)
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -332,9 +371,22 @@ body: |
     ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX10: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
-    ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
-    ; GFX10: $vgpr0 = COPY [[FMUL]](s32)
+    ; GFX10: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+    ; GFX10: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+    ; GFX10: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+    ; GFX10: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; GFX10: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+    ; GFX10: S_DENORM_MODE 15, implicit-def $mode, implicit $mode
+    ; GFX10: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+    ; GFX10: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+    ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+    ; GFX10: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+    ; GFX10: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+    ; GFX10: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+    ; GFX10: S_DENORM_MODE 12, implicit-def $mode, implicit $mode
+    ; GFX10: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+    ; GFX10: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+    ; GFX10: $vgpr0 = COPY [[INT6]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s32) = arcp G_FDIV %0, %1
@@ -1898,16 +1950,28 @@ body: |
     ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
     ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
-    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
+    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1916,10 +1980,16 @@ body: |
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
+    ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
-    ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s16) = G_FCONSTANT half 1.0
     %1:_(s32) = COPY $vgpr0
@@ -1958,18 +2028,28 @@ body: |
     ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
     ; SI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+    ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
     ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
-    ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
-    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; VI: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+    ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
     ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
-    ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
-    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
     ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1979,11 +2059,16 @@ body: |
     ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
     ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+    ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
     ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX10: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
-    ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
-    ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+    ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+    ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+    ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+    ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+    ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+    ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s16) = G_FCONSTANT half -1.0
     %1:_(s32) = COPY $vgpr0


        


More information about the llvm-commits mailing list